1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Note: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 93 94 PetscFunctionBegin; 95 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 96 if (maxallocrow == MAT_SKIP_ALLOCATION) { 97 skipallocation = PETSC_TRUE; 98 maxallocrow = 0; 99 } 100 101 PetscCall(PetscLayoutSetUp(B->rmap)); 102 PetscCall(PetscLayoutSetUp(B->cmap)); 103 104 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 105 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 106 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 107 if (rlen) { 108 for (i = 0; i < B->rmap->n; i++) { 109 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 110 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 111 } 112 } 113 114 B->preallocated = PETSC_TRUE; 115 116 b = (Mat_SeqSELL *)B->data; 117 118 if (!b->sliceheight) { /* not set yet */ 119 #if defined(PETSC_HAVE_CUDA) 120 b->sliceheight = 16; 121 #else 122 b->sliceheight = 8; 123 #endif 124 } 125 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 126 b->totalslices = totalslices; 127 if (!skipallocation) { 128 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 129 130 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 131 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i = 1; i < totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 143 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 144 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 145 } 146 /* last slice */ 147 b->sliidx[totalslices] = 0; 148 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 149 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 150 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 151 } 152 153 /* allocate space for val, colidx, rlen */ 154 /* FIXME: should B's old memory be unlogged? */ 155 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 156 /* FIXME: assuming an element of the bit array takes 8 bits */ 157 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 158 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 159 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 160 161 b->singlemalloc = PETSC_TRUE; 162 b->free_val = PETSC_TRUE; 163 b->free_colidx = PETSC_TRUE; 164 } else { 165 b->free_val = PETSC_FALSE; 166 b->free_colidx = PETSC_FALSE; 167 } 168 169 b->nz = 0; 170 b->maxallocrow = maxallocrow; 171 b->rlenmax = maxallocrow; 172 b->maxallocmat = b->sliidx[totalslices]; 173 B->info.nz_unneeded = (double)b->maxallocmat; 174 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 175 PetscFunctionReturn(PETSC_SUCCESS); 176 } 177 178 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 179 { 180 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 181 PetscInt shift; 182 183 PetscFunctionBegin; 184 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 185 if (nz) *nz = a->rlen[row]; 186 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 187 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 188 if (idx) { 189 PetscInt j; 190 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 191 *idx = a->getrowcols; 192 } 193 if (v) { 194 PetscInt j; 195 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 196 *v = a->getrowvals; 197 } 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 202 { 203 PetscFunctionBegin; 204 PetscFunctionReturn(PETSC_SUCCESS); 205 } 206 207 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 208 { 209 Mat B; 210 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 211 PetscInt i; 212 213 PetscFunctionBegin; 214 if (reuse == MAT_REUSE_MATRIX) { 215 B = *newmat; 216 PetscCall(MatZeroEntries(B)); 217 } else { 218 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 219 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 220 PetscCall(MatSetType(B, MATSEQAIJ)); 221 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 222 } 223 224 for (i = 0; i < A->rmap->n; i++) { 225 PetscInt nz = 0, *cols = NULL; 226 PetscScalar *vals = NULL; 227 228 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 229 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 230 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 231 } 232 233 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 234 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 235 B->rmap->bs = A->rmap->bs; 236 237 if (reuse == MAT_INPLACE_MATRIX) { 238 PetscCall(MatHeaderReplace(A, &B)); 239 } else { 240 *newmat = B; 241 } 242 PetscFunctionReturn(PETSC_SUCCESS); 243 } 244 245 #include <../src/mat/impls/aij/seq/aij.h> 246 247 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 248 { 249 Mat B; 250 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 251 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 252 const PetscInt *cols; 253 const PetscScalar *vals; 254 255 PetscFunctionBegin; 256 257 if (reuse == MAT_REUSE_MATRIX) { 258 B = *newmat; 259 } else { 260 if (PetscDefined(USE_DEBUG) || !a->ilen) { 261 PetscCall(PetscMalloc1(m, &rowlengths)); 262 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 263 } 264 if (PetscDefined(USE_DEBUG) && a->ilen) { 265 PetscBool eq; 266 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 267 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 268 PetscCall(PetscFree(rowlengths)); 269 rowlengths = a->ilen; 270 } else if (a->ilen) rowlengths = a->ilen; 271 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 272 PetscCall(MatSetSizes(B, m, n, m, n)); 273 PetscCall(MatSetType(B, MATSEQSELL)); 274 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 275 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 276 } 277 278 for (row = 0; row < m; row++) { 279 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 280 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 281 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 282 } 283 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 284 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 285 B->rmap->bs = A->rmap->bs; 286 287 if (reuse == MAT_INPLACE_MATRIX) { 288 PetscCall(MatHeaderReplace(A, &B)); 289 } else { 290 *newmat = B; 291 } 292 PetscFunctionReturn(PETSC_SUCCESS); 293 } 294 295 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 296 { 297 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 298 PetscScalar *y; 299 const PetscScalar *x; 300 const MatScalar *aval = a->val; 301 PetscInt totalslices = a->totalslices; 302 const PetscInt *acolidx = a->colidx; 303 PetscInt i, j; 304 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 305 __m512d vec_x, vec_y, vec_vals; 306 __m256i vec_idx; 307 __mmask8 mask; 308 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 309 __m256i vec_idx2, vec_idx3, vec_idx4; 310 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 311 __m128i vec_idx; 312 __m256d vec_x, vec_y, vec_y2, vec_vals; 313 MatScalar yval; 314 PetscInt r, rows_left, row, nnz_in_row; 315 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 316 __m128d vec_x_tmp; 317 __m256d vec_x, vec_y, vec_y2, vec_vals; 318 MatScalar yval; 319 PetscInt r, rows_left, row, nnz_in_row; 320 #else 321 PetscInt k, sliceheight = a->sliceheight; 322 PetscScalar *sum; 323 #endif 324 325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 326 #pragma disjoint(*x, *y, *aval) 327 #endif 328 329 PetscFunctionBegin; 330 PetscCall(VecGetArrayRead(xx, &x)); 331 PetscCall(VecGetArray(yy, &y)); 332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 333 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 334 for (i = 0; i < totalslices; i++) { /* loop over slices */ 335 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 336 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 337 338 vec_y = _mm512_setzero_pd(); 339 vec_y2 = _mm512_setzero_pd(); 340 vec_y3 = _mm512_setzero_pd(); 341 vec_y4 = _mm512_setzero_pd(); 342 343 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 344 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 345 case 3: 346 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 347 acolidx += 8; 348 aval += 8; 349 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 350 acolidx += 8; 351 aval += 8; 352 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 353 acolidx += 8; 354 aval += 8; 355 j += 3; 356 break; 357 case 2: 358 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 359 acolidx += 8; 360 aval += 8; 361 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 362 acolidx += 8; 363 aval += 8; 364 j += 2; 365 break; 366 case 1: 367 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 368 acolidx += 8; 369 aval += 8; 370 j += 1; 371 break; 372 } 373 #pragma novector 374 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 375 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 376 acolidx += 8; 377 aval += 8; 378 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 379 acolidx += 8; 380 aval += 8; 381 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 382 acolidx += 8; 383 aval += 8; 384 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 385 acolidx += 8; 386 aval += 8; 387 } 388 389 vec_y = _mm512_add_pd(vec_y, vec_y2); 390 vec_y = _mm512_add_pd(vec_y, vec_y3); 391 vec_y = _mm512_add_pd(vec_y, vec_y4); 392 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 393 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 394 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 395 } else { 396 _mm512_storeu_pd(&y[8 * i], vec_y); 397 } 398 } 399 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 400 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 401 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 402 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 403 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 404 405 /* last slice may have padding rows. Don't use vectorization. */ 406 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 407 rows_left = A->rmap->n - 8 * i; 408 for (r = 0; r < rows_left; ++r) { 409 yval = (MatScalar)0; 410 row = 8 * i + r; 411 nnz_in_row = a->rlen[row]; 412 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 413 y[row] = yval; 414 } 415 break; 416 } 417 418 vec_y = _mm256_setzero_pd(); 419 vec_y2 = _mm256_setzero_pd(); 420 421 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 422 #pragma novector 423 #pragma unroll(2) 424 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 425 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 426 aval += 4; 427 acolidx += 4; 428 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 429 aval += 4; 430 acolidx += 4; 431 } 432 433 _mm256_storeu_pd(y + i * 8, vec_y); 434 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 435 } 436 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 437 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 438 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 439 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 440 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 441 442 vec_y = _mm256_setzero_pd(); 443 vec_y2 = _mm256_setzero_pd(); 444 445 /* last slice may have padding rows. Don't use vectorization. */ 446 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 447 rows_left = A->rmap->n - 8 * i; 448 for (r = 0; r < rows_left; ++r) { 449 yval = (MatScalar)0; 450 row = 8 * i + r; 451 nnz_in_row = a->rlen[row]; 452 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 453 y[row] = yval; 454 } 455 break; 456 } 457 458 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 459 #pragma novector 460 #pragma unroll(2) 461 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 462 vec_vals = _mm256_loadu_pd(aval); 463 vec_x_tmp = _mm_setzero_pd(); 464 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 465 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 466 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 467 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 468 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 469 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 470 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 471 aval += 4; 472 473 vec_vals = _mm256_loadu_pd(aval); 474 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 475 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 476 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 477 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 478 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 479 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 480 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 481 aval += 4; 482 } 483 484 _mm256_storeu_pd(y + i * 8, vec_y); 485 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 486 } 487 #else 488 PetscCall(PetscMalloc1(sliceheight, &sum)); 489 for (i = 0; i < totalslices; i++) { /* loop over slices */ 490 for (j = 0; j < sliceheight; j++) { 491 sum[j] = 0.0; 492 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 493 } 494 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 495 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 496 } else { 497 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 498 } 499 } 500 PetscCall(PetscFree(sum)); 501 #endif 502 503 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 504 PetscCall(VecRestoreArrayRead(xx, &x)); 505 PetscCall(VecRestoreArray(yy, &y)); 506 PetscFunctionReturn(PETSC_SUCCESS); 507 } 508 509 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 510 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 511 { 512 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 513 PetscScalar *y, *z; 514 const PetscScalar *x; 515 const MatScalar *aval = a->val; 516 PetscInt totalslices = a->totalslices; 517 const PetscInt *acolidx = a->colidx; 518 PetscInt i, j; 519 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 520 __m512d vec_x, vec_y, vec_vals; 521 __m256i vec_idx; 522 __mmask8 mask; 523 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 524 __m256i vec_idx2, vec_idx3, vec_idx4; 525 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 526 __m128d vec_x_tmp; 527 __m256d vec_x, vec_y, vec_y2, vec_vals; 528 MatScalar yval; 529 PetscInt r, row, nnz_in_row; 530 #else 531 PetscInt k, sliceheight = a->sliceheight; 532 PetscScalar *sum; 533 #endif 534 535 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 536 #pragma disjoint(*x, *y, *aval) 537 #endif 538 539 PetscFunctionBegin; 540 if (!a->nz) { 541 PetscCall(VecCopy(yy, zz)); 542 PetscFunctionReturn(PETSC_SUCCESS); 543 } 544 PetscCall(VecGetArrayRead(xx, &x)); 545 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 546 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 547 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 548 for (i = 0; i < totalslices; i++) { /* loop over slices */ 549 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 550 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 551 552 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 553 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 554 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 555 } else { 556 vec_y = _mm512_loadu_pd(&y[8 * i]); 557 } 558 vec_y2 = _mm512_setzero_pd(); 559 vec_y3 = _mm512_setzero_pd(); 560 vec_y4 = _mm512_setzero_pd(); 561 562 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 563 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 564 case 3: 565 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 566 acolidx += 8; 567 aval += 8; 568 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 569 acolidx += 8; 570 aval += 8; 571 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 572 acolidx += 8; 573 aval += 8; 574 j += 3; 575 break; 576 case 2: 577 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 578 acolidx += 8; 579 aval += 8; 580 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 581 acolidx += 8; 582 aval += 8; 583 j += 2; 584 break; 585 case 1: 586 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 587 acolidx += 8; 588 aval += 8; 589 j += 1; 590 break; 591 } 592 #pragma novector 593 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 594 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 595 acolidx += 8; 596 aval += 8; 597 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 598 acolidx += 8; 599 aval += 8; 600 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 601 acolidx += 8; 602 aval += 8; 603 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 604 acolidx += 8; 605 aval += 8; 606 } 607 608 vec_y = _mm512_add_pd(vec_y, vec_y2); 609 vec_y = _mm512_add_pd(vec_y, vec_y3); 610 vec_y = _mm512_add_pd(vec_y, vec_y4); 611 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 612 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 613 } else { 614 _mm512_storeu_pd(&z[8 * i], vec_y); 615 } 616 } 617 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 618 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 619 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 620 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 621 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 622 623 /* last slice may have padding rows. Don't use vectorization. */ 624 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 625 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 626 row = 8 * i + r; 627 yval = (MatScalar)0.0; 628 nnz_in_row = a->rlen[row]; 629 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 630 z[row] = y[row] + yval; 631 } 632 break; 633 } 634 635 vec_y = _mm256_loadu_pd(y + 8 * i); 636 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 637 638 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 639 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 640 vec_vals = _mm256_loadu_pd(aval); 641 vec_x_tmp = _mm_setzero_pd(); 642 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 643 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 644 vec_x = _mm256_setzero_pd(); 645 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 646 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 647 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 648 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 649 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 650 aval += 4; 651 652 vec_vals = _mm256_loadu_pd(aval); 653 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 654 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 655 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 656 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 657 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 658 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 659 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 660 aval += 4; 661 } 662 663 _mm256_storeu_pd(z + i * 8, vec_y); 664 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 665 } 666 #else 667 PetscCall(PetscMalloc1(sliceheight, &sum)); 668 for (i = 0; i < totalslices; i++) { /* loop over slices */ 669 for (j = 0; j < sliceheight; j++) { 670 sum[j] = 0.0; 671 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 672 } 673 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 674 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 675 } else { 676 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 677 } 678 } 679 PetscCall(PetscFree(sum)); 680 #endif 681 682 PetscCall(PetscLogFlops(2.0 * a->nz)); 683 PetscCall(VecRestoreArrayRead(xx, &x)); 684 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 685 PetscFunctionReturn(PETSC_SUCCESS); 686 } 687 688 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 689 { 690 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 691 PetscScalar *y; 692 const PetscScalar *x; 693 const MatScalar *aval = a->val; 694 const PetscInt *acolidx = a->colidx; 695 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 696 697 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 698 #pragma disjoint(*x, *y, *aval) 699 #endif 700 701 PetscFunctionBegin; 702 if (A->symmetric == PETSC_BOOL3_TRUE) { 703 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 704 PetscFunctionReturn(PETSC_SUCCESS); 705 } 706 if (zz != yy) PetscCall(VecCopy(zz, yy)); 707 708 if (a->nz) { 709 PetscCall(VecGetArrayRead(xx, &x)); 710 PetscCall(VecGetArray(yy, &y)); 711 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 712 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 713 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 714 row = sliceheight * i + r; 715 nnz_in_row = a->rlen[row]; 716 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 717 } 718 break; 719 } 720 for (r = 0; r < sliceheight; ++r) 721 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 722 } 723 PetscCall(PetscLogFlops(2.0 * a->nz)); 724 PetscCall(VecRestoreArrayRead(xx, &x)); 725 PetscCall(VecRestoreArray(yy, &y)); 726 } 727 PetscFunctionReturn(PETSC_SUCCESS); 728 } 729 730 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 731 { 732 PetscFunctionBegin; 733 if (A->symmetric == PETSC_BOOL3_TRUE) { 734 PetscCall(MatMult_SeqSELL(A, xx, yy)); 735 } else { 736 PetscCall(VecSet(yy, 0.0)); 737 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 738 } 739 PetscFunctionReturn(PETSC_SUCCESS); 740 } 741 742 /* 743 Checks for missing diagonals 744 */ 745 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 746 { 747 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 748 PetscInt *diag, i; 749 750 PetscFunctionBegin; 751 *missing = PETSC_FALSE; 752 if (A->rmap->n > 0 && !(a->colidx)) { 753 *missing = PETSC_TRUE; 754 if (d) *d = 0; 755 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 756 } else { 757 diag = a->diag; 758 for (i = 0; i < A->rmap->n; i++) { 759 if (diag[i] == -1) { 760 *missing = PETSC_TRUE; 761 if (d) *d = i; 762 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 763 break; 764 } 765 } 766 } 767 PetscFunctionReturn(PETSC_SUCCESS); 768 } 769 770 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 771 { 772 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 773 PetscInt i, j, m = A->rmap->n, shift; 774 775 PetscFunctionBegin; 776 if (!a->diag) { 777 PetscCall(PetscMalloc1(m, &a->diag)); 778 a->free_diag = PETSC_TRUE; 779 } 780 for (i = 0; i < m; i++) { /* loop over rows */ 781 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 782 a->diag[i] = -1; 783 for (j = 0; j < a->rlen[i]; j++) { 784 if (a->colidx[shift + a->sliceheight * j] == i) { 785 a->diag[i] = shift + a->sliceheight * j; 786 break; 787 } 788 } 789 } 790 PetscFunctionReturn(PETSC_SUCCESS); 791 } 792 793 /* 794 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 795 */ 796 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 797 { 798 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 799 PetscInt i, *diag, m = A->rmap->n; 800 MatScalar *val = a->val; 801 PetscScalar *idiag, *mdiag; 802 803 PetscFunctionBegin; 804 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 805 PetscCall(MatMarkDiagonal_SeqSELL(A)); 806 diag = a->diag; 807 if (!a->idiag) { 808 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 809 val = a->val; 810 } 811 mdiag = a->mdiag; 812 idiag = a->idiag; 813 814 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 815 for (i = 0; i < m; i++) { 816 mdiag[i] = val[diag[i]]; 817 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 818 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 819 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 820 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 821 A->factorerror_zeropivot_value = 0.0; 822 A->factorerror_zeropivot_row = i; 823 } 824 idiag[i] = 1.0 / val[diag[i]]; 825 } 826 PetscCall(PetscLogFlops(m)); 827 } else { 828 for (i = 0; i < m; i++) { 829 mdiag[i] = val[diag[i]]; 830 idiag[i] = omega / (fshift + val[diag[i]]); 831 } 832 PetscCall(PetscLogFlops(2.0 * m)); 833 } 834 a->idiagvalid = PETSC_TRUE; 835 PetscFunctionReturn(PETSC_SUCCESS); 836 } 837 838 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 839 { 840 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 841 842 PetscFunctionBegin; 843 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 844 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 845 PetscFunctionReturn(PETSC_SUCCESS); 846 } 847 848 PetscErrorCode MatDestroy_SeqSELL(Mat A) 849 { 850 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 851 852 PetscFunctionBegin; 853 #if defined(PETSC_USE_LOG) 854 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 855 #endif 856 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 857 PetscCall(ISDestroy(&a->row)); 858 PetscCall(ISDestroy(&a->col)); 859 PetscCall(PetscFree(a->diag)); 860 PetscCall(PetscFree(a->rlen)); 861 PetscCall(PetscFree(a->sliidx)); 862 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 863 PetscCall(PetscFree(a->solve_work)); 864 PetscCall(ISDestroy(&a->icol)); 865 PetscCall(PetscFree(a->saved_values)); 866 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 867 PetscCall(PetscFree(A->data)); 868 869 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 870 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 871 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 872 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 873 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 874 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 875 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL)); 876 #if defined(PETSC_HAVE_CUDA) 877 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL)); 878 #endif 879 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 880 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 881 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 882 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 883 PetscFunctionReturn(PETSC_SUCCESS); 884 } 885 886 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 887 { 888 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 889 890 PetscFunctionBegin; 891 switch (op) { 892 case MAT_ROW_ORIENTED: 893 a->roworiented = flg; 894 break; 895 case MAT_KEEP_NONZERO_PATTERN: 896 a->keepnonzeropattern = flg; 897 break; 898 case MAT_NEW_NONZERO_LOCATIONS: 899 a->nonew = (flg ? 0 : 1); 900 break; 901 case MAT_NEW_NONZERO_LOCATION_ERR: 902 a->nonew = (flg ? -1 : 0); 903 break; 904 case MAT_NEW_NONZERO_ALLOCATION_ERR: 905 a->nonew = (flg ? -2 : 0); 906 break; 907 case MAT_UNUSED_NONZERO_LOCATION_ERR: 908 a->nounused = (flg ? -1 : 0); 909 break; 910 case MAT_FORCE_DIAGONAL_ENTRIES: 911 case MAT_IGNORE_OFF_PROC_ENTRIES: 912 case MAT_USE_HASH_TABLE: 913 case MAT_SORTED_FULL: 914 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 915 break; 916 case MAT_SPD: 917 case MAT_SYMMETRIC: 918 case MAT_STRUCTURALLY_SYMMETRIC: 919 case MAT_HERMITIAN: 920 case MAT_SYMMETRY_ETERNAL: 921 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 922 case MAT_SPD_ETERNAL: 923 /* These options are handled directly by MatSetOption() */ 924 break; 925 default: 926 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 927 } 928 PetscFunctionReturn(PETSC_SUCCESS); 929 } 930 931 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 932 { 933 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 934 PetscInt i, j, n, shift; 935 PetscScalar *x, zero = 0.0; 936 937 PetscFunctionBegin; 938 PetscCall(VecGetLocalSize(v, &n)); 939 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 940 941 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 942 PetscInt *diag = a->diag; 943 PetscCall(VecGetArray(v, &x)); 944 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 945 PetscCall(VecRestoreArray(v, &x)); 946 PetscFunctionReturn(PETSC_SUCCESS); 947 } 948 949 PetscCall(VecSet(v, zero)); 950 PetscCall(VecGetArray(v, &x)); 951 for (i = 0; i < n; i++) { /* loop over rows */ 952 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 953 x[i] = 0; 954 for (j = 0; j < a->rlen[i]; j++) { 955 if (a->colidx[shift + a->sliceheight * j] == i) { 956 x[i] = a->val[shift + a->sliceheight * j]; 957 break; 958 } 959 } 960 } 961 PetscCall(VecRestoreArray(v, &x)); 962 PetscFunctionReturn(PETSC_SUCCESS); 963 } 964 965 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 966 { 967 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 968 const PetscScalar *l, *r; 969 PetscInt i, j, m, n, row; 970 971 PetscFunctionBegin; 972 if (ll) { 973 /* The local size is used so that VecMPI can be passed to this routine 974 by MatDiagonalScale_MPISELL */ 975 PetscCall(VecGetLocalSize(ll, &m)); 976 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 977 PetscCall(VecGetArrayRead(ll, &l)); 978 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 979 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 980 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 981 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 982 } 983 } else { 984 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 985 } 986 } 987 PetscCall(VecRestoreArrayRead(ll, &l)); 988 PetscCall(PetscLogFlops(a->nz)); 989 } 990 if (rr) { 991 PetscCall(VecGetLocalSize(rr, &n)); 992 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 993 PetscCall(VecGetArrayRead(rr, &r)); 994 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 995 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 996 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 997 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 998 } 999 } else { 1000 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1001 } 1002 } 1003 PetscCall(VecRestoreArrayRead(rr, &r)); 1004 PetscCall(PetscLogFlops(a->nz)); 1005 } 1006 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1007 #if defined(PETSC_HAVE_CUDA) 1008 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1009 #endif 1010 PetscFunctionReturn(PETSC_SUCCESS); 1011 } 1012 1013 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1014 { 1015 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1016 PetscInt *cp, i, k, low, high, t, row, col, l; 1017 PetscInt shift; 1018 MatScalar *vp; 1019 1020 PetscFunctionBegin; 1021 for (k = 0; k < m; k++) { /* loop over requested rows */ 1022 row = im[k]; 1023 if (row < 0) continue; 1024 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1025 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1026 cp = a->colidx + shift; /* pointer to the row */ 1027 vp = a->val + shift; /* pointer to the row */ 1028 for (l = 0; l < n; l++) { /* loop over requested columns */ 1029 col = in[l]; 1030 if (col < 0) continue; 1031 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1032 high = a->rlen[row]; 1033 low = 0; /* assume unsorted */ 1034 while (high - low > 5) { 1035 t = (low + high) / 2; 1036 if (*(cp + a->sliceheight * t) > col) high = t; 1037 else low = t; 1038 } 1039 for (i = low; i < high; i++) { 1040 if (*(cp + a->sliceheight * i) > col) break; 1041 if (*(cp + a->sliceheight * i) == col) { 1042 *v++ = *(vp + a->sliceheight * i); 1043 goto finished; 1044 } 1045 } 1046 *v++ = 0.0; 1047 finished:; 1048 } 1049 } 1050 PetscFunctionReturn(PETSC_SUCCESS); 1051 } 1052 1053 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1054 { 1055 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1056 PetscInt i, j, m = A->rmap->n, shift; 1057 const char *name; 1058 PetscViewerFormat format; 1059 1060 PetscFunctionBegin; 1061 PetscCall(PetscViewerGetFormat(viewer, &format)); 1062 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1063 PetscInt nofinalvalue = 0; 1064 /* 1065 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1066 nofinalvalue = 1; 1067 } 1068 */ 1069 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1070 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1071 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1072 #if defined(PETSC_USE_COMPLEX) 1073 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1074 #else 1075 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1076 #endif 1077 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1078 1079 for (i = 0; i < m; i++) { 1080 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1081 for (j = 0; j < a->rlen[i]; j++) { 1082 #if defined(PETSC_USE_COMPLEX) 1083 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1084 #else 1085 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1086 #endif 1087 } 1088 } 1089 /* 1090 if (nofinalvalue) { 1091 #if defined(PETSC_USE_COMPLEX) 1092 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1093 #else 1094 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1095 #endif 1096 } 1097 */ 1098 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1100 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1101 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1102 PetscFunctionReturn(PETSC_SUCCESS); 1103 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1104 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1105 for (i = 0; i < m; i++) { 1106 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1107 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1108 for (j = 0; j < a->rlen[i]; j++) { 1109 #if defined(PETSC_USE_COMPLEX) 1110 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1111 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1112 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1113 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1114 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1115 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1116 } 1117 #else 1118 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1119 #endif 1120 } 1121 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1122 } 1123 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1124 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1125 PetscInt cnt = 0, jcnt; 1126 PetscScalar value; 1127 #if defined(PETSC_USE_COMPLEX) 1128 PetscBool realonly = PETSC_TRUE; 1129 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1130 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1131 realonly = PETSC_FALSE; 1132 break; 1133 } 1134 } 1135 #endif 1136 1137 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1138 for (i = 0; i < m; i++) { 1139 jcnt = 0; 1140 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1141 for (j = 0; j < A->cmap->n; j++) { 1142 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1143 value = a->val[cnt++]; 1144 jcnt++; 1145 } else { 1146 value = 0.0; 1147 } 1148 #if defined(PETSC_USE_COMPLEX) 1149 if (realonly) { 1150 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1151 } else { 1152 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1153 } 1154 #else 1155 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1156 #endif 1157 } 1158 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1159 } 1160 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1161 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1162 PetscInt fshift = 1; 1163 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1164 #if defined(PETSC_USE_COMPLEX) 1165 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1166 #else 1167 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1168 #endif 1169 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1170 for (i = 0; i < m; i++) { 1171 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1172 for (j = 0; j < a->rlen[i]; j++) { 1173 #if defined(PETSC_USE_COMPLEX) 1174 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1175 #else 1176 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1177 #endif 1178 } 1179 } 1180 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1181 } else if (format == PETSC_VIEWER_NATIVE) { 1182 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1183 PetscInt row; 1184 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1185 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1186 #if defined(PETSC_USE_COMPLEX) 1187 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1188 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1189 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1190 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1191 } else { 1192 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1193 } 1194 #else 1195 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1196 #endif 1197 } 1198 } 1199 } else { 1200 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1201 if (A->factortype) { 1202 for (i = 0; i < m; i++) { 1203 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1204 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1205 /* L part */ 1206 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1207 #if defined(PETSC_USE_COMPLEX) 1208 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1209 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1210 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1211 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1212 } else { 1213 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1214 } 1215 #else 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1217 #endif 1218 } 1219 /* diagonal */ 1220 j = a->diag[i]; 1221 #if defined(PETSC_USE_COMPLEX) 1222 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1223 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1224 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1225 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1226 } else { 1227 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1228 } 1229 #else 1230 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1231 #endif 1232 1233 /* U part */ 1234 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1235 #if defined(PETSC_USE_COMPLEX) 1236 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1238 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1239 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1240 } else { 1241 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1242 } 1243 #else 1244 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1245 #endif 1246 } 1247 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1248 } 1249 } else { 1250 for (i = 0; i < m; i++) { 1251 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1252 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1253 for (j = 0; j < a->rlen[i]; j++) { 1254 #if defined(PETSC_USE_COMPLEX) 1255 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1256 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1257 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1258 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1259 } else { 1260 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1261 } 1262 #else 1263 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1264 #endif 1265 } 1266 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1267 } 1268 } 1269 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1270 } 1271 PetscCall(PetscViewerFlush(viewer)); 1272 PetscFunctionReturn(PETSC_SUCCESS); 1273 } 1274 1275 #include <petscdraw.h> 1276 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1277 { 1278 Mat A = (Mat)Aa; 1279 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1280 PetscInt i, j, m = A->rmap->n, shift; 1281 int color; 1282 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1283 PetscViewer viewer; 1284 PetscViewerFormat format; 1285 1286 PetscFunctionBegin; 1287 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1288 PetscCall(PetscViewerGetFormat(viewer, &format)); 1289 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1290 1291 /* loop over matrix elements drawing boxes */ 1292 1293 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1294 PetscDrawCollectiveBegin(draw); 1295 /* Blue for negative, Cyan for zero and Red for positive */ 1296 color = PETSC_DRAW_BLUE; 1297 for (i = 0; i < m; i++) { 1298 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1299 y_l = m - i - 1.0; 1300 y_r = y_l + 1.0; 1301 for (j = 0; j < a->rlen[i]; j++) { 1302 x_l = a->colidx[shift + a->sliceheight * j]; 1303 x_r = x_l + 1.0; 1304 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1305 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1306 } 1307 } 1308 color = PETSC_DRAW_CYAN; 1309 for (i = 0; i < m; i++) { 1310 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1311 y_l = m - i - 1.0; 1312 y_r = y_l + 1.0; 1313 for (j = 0; j < a->rlen[i]; j++) { 1314 x_l = a->colidx[shift + a->sliceheight * j]; 1315 x_r = x_l + 1.0; 1316 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1317 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1318 } 1319 } 1320 color = PETSC_DRAW_RED; 1321 for (i = 0; i < m; i++) { 1322 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1323 y_l = m - i - 1.0; 1324 y_r = y_l + 1.0; 1325 for (j = 0; j < a->rlen[i]; j++) { 1326 x_l = a->colidx[shift + a->sliceheight * j]; 1327 x_r = x_l + 1.0; 1328 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1329 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1330 } 1331 } 1332 PetscDrawCollectiveEnd(draw); 1333 } else { 1334 /* use contour shading to indicate magnitude of values */ 1335 /* first determine max of all nonzero values */ 1336 PetscReal minv = 0.0, maxv = 0.0; 1337 PetscInt count = 0; 1338 PetscDraw popup; 1339 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1340 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1341 } 1342 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1343 PetscCall(PetscDrawGetPopup(draw, &popup)); 1344 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1345 1346 PetscDrawCollectiveBegin(draw); 1347 for (i = 0; i < m; i++) { 1348 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1349 y_l = m - i - 1.0; 1350 y_r = y_l + 1.0; 1351 for (j = 0; j < a->rlen[i]; j++) { 1352 x_l = a->colidx[shift + a->sliceheight * j]; 1353 x_r = x_l + 1.0; 1354 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1355 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1356 count++; 1357 } 1358 } 1359 PetscDrawCollectiveEnd(draw); 1360 } 1361 PetscFunctionReturn(PETSC_SUCCESS); 1362 } 1363 1364 #include <petscdraw.h> 1365 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1366 { 1367 PetscDraw draw; 1368 PetscReal xr, yr, xl, yl, h, w; 1369 PetscBool isnull; 1370 1371 PetscFunctionBegin; 1372 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1373 PetscCall(PetscDrawIsNull(draw, &isnull)); 1374 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1375 1376 xr = A->cmap->n; 1377 yr = A->rmap->n; 1378 h = yr / 10.0; 1379 w = xr / 10.0; 1380 xr += w; 1381 yr += h; 1382 xl = -w; 1383 yl = -h; 1384 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1385 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1386 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1387 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1388 PetscCall(PetscDrawSave(draw)); 1389 PetscFunctionReturn(PETSC_SUCCESS); 1390 } 1391 1392 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1393 { 1394 PetscBool iascii, isbinary, isdraw; 1395 1396 PetscFunctionBegin; 1397 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1398 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1399 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1400 if (iascii) { 1401 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1402 } else if (isbinary) { 1403 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1404 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1405 PetscFunctionReturn(PETSC_SUCCESS); 1406 } 1407 1408 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1409 { 1410 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1411 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1412 MatScalar *vp; 1413 1414 PetscFunctionBegin; 1415 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1416 /* To do: compress out the unused elements */ 1417 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1418 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1419 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1420 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1421 a->nonzerorowcnt = 0; 1422 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1423 for (i = 0; i < a->totalslices; ++i) { 1424 shift = a->sliidx[i]; /* starting index of the slice */ 1425 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1426 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1427 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1428 row = a->sliceheight * i + row_in_slice; 1429 nrow = a->rlen[row]; /* number of nonzeros in row */ 1430 /* 1431 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1432 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1433 */ 1434 lastcol = 0; 1435 if (nrow > 0) { /* nonempty row */ 1436 a->nonzerorowcnt++; 1437 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1438 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1439 for (j = 1; j < a->sliceheight; j++) { 1440 if (a->rlen[a->sliceheight * i + j]) { 1441 lastcol = cp[j]; 1442 break; 1443 } 1444 } 1445 } else { 1446 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1447 } 1448 1449 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1450 cp[a->sliceheight * k + row_in_slice] = lastcol; 1451 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1452 } 1453 } 1454 } 1455 1456 A->info.mallocs += a->reallocs; 1457 a->reallocs = 0; 1458 1459 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1460 PetscFunctionReturn(PETSC_SUCCESS); 1461 } 1462 1463 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1464 { 1465 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1466 1467 PetscFunctionBegin; 1468 info->block_size = 1.0; 1469 info->nz_allocated = a->maxallocmat; 1470 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1471 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1472 info->assemblies = A->num_ass; 1473 info->mallocs = A->info.mallocs; 1474 info->memory = 0; /* REVIEW ME */ 1475 if (A->factortype) { 1476 info->fill_ratio_given = A->info.fill_ratio_given; 1477 info->fill_ratio_needed = A->info.fill_ratio_needed; 1478 info->factor_mallocs = A->info.factor_mallocs; 1479 } else { 1480 info->fill_ratio_given = 0; 1481 info->fill_ratio_needed = 0; 1482 info->factor_mallocs = 0; 1483 } 1484 PetscFunctionReturn(PETSC_SUCCESS); 1485 } 1486 1487 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1488 { 1489 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1490 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1491 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1492 MatScalar *vp, value; 1493 #if defined(PETSC_HAVE_CUDA) 1494 PetscBool inserted = PETSC_FALSE; 1495 #endif 1496 1497 PetscFunctionBegin; 1498 for (k = 0; k < m; k++) { /* loop over added rows */ 1499 row = im[k]; 1500 if (row < 0) continue; 1501 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1502 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1503 cp = a->colidx + shift; /* pointer to the row */ 1504 vp = a->val + shift; /* pointer to the row */ 1505 nrow = a->rlen[row]; 1506 low = 0; 1507 high = nrow; 1508 1509 for (l = 0; l < n; l++) { /* loop over added columns */ 1510 col = in[l]; 1511 if (col < 0) continue; 1512 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1513 if (a->roworiented) { 1514 value = v[l + k * n]; 1515 } else { 1516 value = v[k + l * m]; 1517 } 1518 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1519 1520 /* search in this row for the specified column, i indicates the column to be set */ 1521 if (col <= lastcol) low = 0; 1522 else high = nrow; 1523 lastcol = col; 1524 while (high - low > 5) { 1525 t = (low + high) / 2; 1526 if (*(cp + a->sliceheight * t) > col) high = t; 1527 else low = t; 1528 } 1529 for (i = low; i < high; i++) { 1530 if (*(cp + a->sliceheight * i) > col) break; 1531 if (*(cp + a->sliceheight * i) == col) { 1532 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1533 else *(vp + a->sliceheight * i) = value; 1534 #if defined(PETSC_HAVE_CUDA) 1535 inserted = PETSC_TRUE; 1536 #endif 1537 low = i + 1; 1538 goto noinsert; 1539 } 1540 } 1541 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1542 if (nonew == 1) goto noinsert; 1543 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1544 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1545 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar); 1546 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1547 for (ii = nrow - 1; ii >= i; ii--) { 1548 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1549 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1550 } 1551 a->rlen[row]++; 1552 *(cp + a->sliceheight * i) = col; 1553 *(vp + a->sliceheight * i) = value; 1554 a->nz++; 1555 A->nonzerostate++; 1556 #if defined(PETSC_HAVE_CUDA) 1557 inserted = PETSC_TRUE; 1558 #endif 1559 low = i + 1; 1560 high++; 1561 nrow++; 1562 noinsert:; 1563 } 1564 a->rlen[row] = nrow; 1565 } 1566 #if defined(PETSC_HAVE_CUDA) 1567 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1568 #endif 1569 PetscFunctionReturn(PETSC_SUCCESS); 1570 } 1571 1572 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1573 { 1574 PetscFunctionBegin; 1575 /* If the two matrices have the same copy implementation, use fast copy. */ 1576 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1577 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1578 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1579 1580 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1581 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1582 } else { 1583 PetscCall(MatCopy_Basic(A, B, str)); 1584 } 1585 PetscFunctionReturn(PETSC_SUCCESS); 1586 } 1587 1588 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1589 { 1590 PetscFunctionBegin; 1591 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1592 PetscFunctionReturn(PETSC_SUCCESS); 1593 } 1594 1595 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1596 { 1597 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1598 1599 PetscFunctionBegin; 1600 *array = a->val; 1601 PetscFunctionReturn(PETSC_SUCCESS); 1602 } 1603 1604 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1605 { 1606 PetscFunctionBegin; 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1611 { 1612 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1613 PetscInt i; 1614 MatScalar *aval = a->val; 1615 1616 PetscFunctionBegin; 1617 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1618 #if defined(PETSC_HAVE_CUDA) 1619 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1620 #endif 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1625 { 1626 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1627 PetscInt i; 1628 MatScalar *aval = a->val; 1629 1630 PetscFunctionBegin; 1631 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1632 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1633 #if defined(PETSC_HAVE_CUDA) 1634 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1635 #endif 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1640 { 1641 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1642 MatScalar *aval = a->val; 1643 PetscScalar oalpha = alpha; 1644 PetscBLASInt one = 1, size; 1645 1646 PetscFunctionBegin; 1647 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1648 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1649 PetscCall(PetscLogFlops(a->nz)); 1650 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1651 #if defined(PETSC_HAVE_CUDA) 1652 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1653 #endif 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1658 { 1659 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1660 1661 PetscFunctionBegin; 1662 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1663 PetscCall(MatShift_Basic(Y, a)); 1664 PetscFunctionReturn(PETSC_SUCCESS); 1665 } 1666 1667 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1668 { 1669 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1670 PetscScalar *x, sum, *t; 1671 const MatScalar *idiag = NULL, *mdiag; 1672 const PetscScalar *b, *xb; 1673 PetscInt n, m = A->rmap->n, i, j, shift; 1674 const PetscInt *diag; 1675 1676 PetscFunctionBegin; 1677 its = its * lits; 1678 1679 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1680 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1681 a->fshift = fshift; 1682 a->omega = omega; 1683 1684 diag = a->diag; 1685 t = a->ssor_work; 1686 idiag = a->idiag; 1687 mdiag = a->mdiag; 1688 1689 PetscCall(VecGetArray(xx, &x)); 1690 PetscCall(VecGetArrayRead(bb, &b)); 1691 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1692 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1693 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1694 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1695 1696 if (flag & SOR_ZERO_INITIAL_GUESS) { 1697 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1698 for (i = 0; i < m; i++) { 1699 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1700 sum = b[i]; 1701 n = (diag[i] - shift) / a->sliceheight; 1702 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1703 t[i] = sum; 1704 x[i] = sum * idiag[i]; 1705 } 1706 xb = t; 1707 PetscCall(PetscLogFlops(a->nz)); 1708 } else xb = b; 1709 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1710 for (i = m - 1; i >= 0; i--) { 1711 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1712 sum = xb[i]; 1713 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1714 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1715 if (xb == b) { 1716 x[i] = sum * idiag[i]; 1717 } else { 1718 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1719 } 1720 } 1721 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1722 } 1723 its--; 1724 } 1725 while (its--) { 1726 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1727 for (i = 0; i < m; i++) { 1728 /* lower */ 1729 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1730 sum = b[i]; 1731 n = (diag[i] - shift) / a->sliceheight; 1732 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1733 t[i] = sum; /* save application of the lower-triangular part */ 1734 /* upper */ 1735 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1736 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1737 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1738 } 1739 xb = t; 1740 PetscCall(PetscLogFlops(2.0 * a->nz)); 1741 } else xb = b; 1742 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1743 for (i = m - 1; i >= 0; i--) { 1744 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1745 sum = xb[i]; 1746 if (xb == b) { 1747 /* whole matrix (no checkpointing available) */ 1748 n = a->rlen[i]; 1749 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1750 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1751 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1752 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1753 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1754 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1755 } 1756 } 1757 if (xb == b) { 1758 PetscCall(PetscLogFlops(2.0 * a->nz)); 1759 } else { 1760 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1761 } 1762 } 1763 } 1764 PetscCall(VecRestoreArray(xx, &x)); 1765 PetscCall(VecRestoreArrayRead(bb, &b)); 1766 PetscFunctionReturn(PETSC_SUCCESS); 1767 } 1768 1769 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1770 MatGetRow_SeqSELL, 1771 MatRestoreRow_SeqSELL, 1772 MatMult_SeqSELL, 1773 /* 4*/ MatMultAdd_SeqSELL, 1774 MatMultTranspose_SeqSELL, 1775 MatMultTransposeAdd_SeqSELL, 1776 NULL, 1777 NULL, 1778 NULL, 1779 /* 10*/ NULL, 1780 NULL, 1781 NULL, 1782 MatSOR_SeqSELL, 1783 NULL, 1784 /* 15*/ MatGetInfo_SeqSELL, 1785 MatEqual_SeqSELL, 1786 MatGetDiagonal_SeqSELL, 1787 MatDiagonalScale_SeqSELL, 1788 NULL, 1789 /* 20*/ NULL, 1790 MatAssemblyEnd_SeqSELL, 1791 MatSetOption_SeqSELL, 1792 MatZeroEntries_SeqSELL, 1793 /* 24*/ NULL, 1794 NULL, 1795 NULL, 1796 NULL, 1797 NULL, 1798 /* 29*/ MatSetUp_SeqSELL, 1799 NULL, 1800 NULL, 1801 NULL, 1802 NULL, 1803 /* 34*/ MatDuplicate_SeqSELL, 1804 NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 /* 39*/ NULL, 1809 NULL, 1810 NULL, 1811 MatGetValues_SeqSELL, 1812 MatCopy_SeqSELL, 1813 /* 44*/ NULL, 1814 MatScale_SeqSELL, 1815 MatShift_SeqSELL, 1816 NULL, 1817 NULL, 1818 /* 49*/ NULL, 1819 NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1824 NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 /* 59*/ NULL, 1829 MatDestroy_SeqSELL, 1830 MatView_SeqSELL, 1831 NULL, 1832 NULL, 1833 /* 64*/ NULL, 1834 NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 /* 69*/ NULL, 1839 NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 /* 74*/ NULL, 1844 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1845 NULL, 1846 NULL, 1847 NULL, 1848 /* 79*/ NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 /* 84*/ NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 /* 89*/ NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 /* 94*/ NULL, 1864 NULL, 1865 NULL, 1866 NULL, 1867 NULL, 1868 /* 99*/ NULL, 1869 NULL, 1870 NULL, 1871 MatConjugate_SeqSELL, 1872 NULL, 1873 /*104*/ NULL, 1874 NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 /*109*/ NULL, 1879 NULL, 1880 NULL, 1881 NULL, 1882 MatMissingDiagonal_SeqSELL, 1883 /*114*/ NULL, 1884 NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 /*119*/ NULL, 1889 NULL, 1890 NULL, 1891 NULL, 1892 NULL, 1893 /*124*/ NULL, 1894 NULL, 1895 NULL, 1896 NULL, 1897 NULL, 1898 /*129*/ NULL, 1899 NULL, 1900 NULL, 1901 NULL, 1902 NULL, 1903 /*134*/ NULL, 1904 NULL, 1905 NULL, 1906 NULL, 1907 NULL, 1908 /*139*/ NULL, 1909 NULL, 1910 NULL, 1911 MatFDColoringSetUp_SeqXAIJ, 1912 NULL, 1913 /*144*/ NULL, 1914 NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 /*150*/ NULL, 1920 NULL}; 1921 1922 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1923 { 1924 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1925 1926 PetscFunctionBegin; 1927 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1928 1929 /* allocate space for values if not already there */ 1930 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1931 1932 /* copy values over */ 1933 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1934 PetscFunctionReturn(PETSC_SUCCESS); 1935 } 1936 1937 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1938 { 1939 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1940 1941 PetscFunctionBegin; 1942 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1943 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1944 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1945 PetscFunctionReturn(PETSC_SUCCESS); 1946 } 1947 1948 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1949 { 1950 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1951 1952 PetscFunctionBegin; 1953 if (a->totalslices && a->sliidx[a->totalslices]) { 1954 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1955 } else { 1956 *ratio = 0.0; 1957 } 1958 PetscFunctionReturn(PETSC_SUCCESS); 1959 } 1960 1961 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1962 { 1963 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1964 PetscInt i, current_slicewidth; 1965 1966 PetscFunctionBegin; 1967 *slicewidth = 0; 1968 for (i = 0; i < a->totalslices; i++) { 1969 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1970 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1971 } 1972 PetscFunctionReturn(PETSC_SUCCESS); 1973 } 1974 1975 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1976 { 1977 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1978 1979 PetscFunctionBegin; 1980 *slicewidth = 0; 1981 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 1982 PetscFunctionReturn(PETSC_SUCCESS); 1983 } 1984 1985 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 1986 { 1987 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1988 1989 PetscFunctionBegin; 1990 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 1991 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 1992 a->sliceheight = sliceheight; 1993 PetscFunctionReturn(PETSC_SUCCESS); 1994 } 1995 1996 /*@C 1997 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 1998 1999 Not Collective 2000 2001 Input Parameters: 2002 + A - a `MATSEQSELL` matrix 2003 - array - pointer to the data 2004 2005 Level: intermediate 2006 2007 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 2008 @*/ 2009 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 2010 { 2011 PetscFunctionBegin; 2012 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 2013 PetscFunctionReturn(PETSC_SUCCESS); 2014 } 2015 2016 /*@C 2017 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2018 2019 Not Collective 2020 2021 Input Parameter: 2022 . A - a MATSEQSELL matrix 2023 2024 Output Parameter: 2025 . ratio - ratio of number of padded zeros to number of allocated elements 2026 2027 Level: intermediate 2028 @*/ 2029 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2030 { 2031 PetscFunctionBegin; 2032 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2033 PetscFunctionReturn(PETSC_SUCCESS); 2034 } 2035 2036 /*@C 2037 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2038 2039 Not Collective 2040 2041 Input Parameter: 2042 . A - a MATSEQSELL matrix 2043 2044 Output Parameter: 2045 . slicewidth - maximum slice width 2046 2047 Level: intermediate 2048 @*/ 2049 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2050 { 2051 PetscFunctionBegin; 2052 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2053 PetscFunctionReturn(PETSC_SUCCESS); 2054 } 2055 2056 /*@C 2057 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2058 2059 Not Collective 2060 2061 Input Parameter: 2062 . A - a MATSEQSELL matrix 2063 2064 Output Parameter: 2065 . slicewidth - average slice width 2066 2067 Level: intermediate 2068 @*/ 2069 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2070 { 2071 PetscFunctionBegin; 2072 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2073 PetscFunctionReturn(PETSC_SUCCESS); 2074 } 2075 2076 /*@C 2077 MatSeqSELLSetSliceHeight - sets the slice height. 2078 2079 Not Collective 2080 2081 Input Parameters: 2082 + A - a MATSEQSELL matrix 2083 - sliceheight - slice height 2084 2085 Notes: 2086 You cannot change the slice height once it have been set. 2087 2088 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2089 2090 Level: intermediate 2091 @*/ 2092 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2093 { 2094 PetscFunctionBegin; 2095 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2096 PetscFunctionReturn(PETSC_SUCCESS); 2097 } 2098 2099 /*@C 2100 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2101 2102 Not Collective 2103 2104 Input Parameter: 2105 . A - a MATSEQSELL matrix 2106 2107 Output Parameter: 2108 . variance - variance of the slice size 2109 2110 Level: intermediate 2111 @*/ 2112 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2113 { 2114 PetscFunctionBegin; 2115 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2116 PetscFunctionReturn(PETSC_SUCCESS); 2117 } 2118 2119 #if defined(PETSC_HAVE_CUDA) 2120 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2121 #endif 2122 2123 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2124 { 2125 Mat_SeqSELL *b; 2126 PetscMPIInt size; 2127 2128 PetscFunctionBegin; 2129 PetscCall(PetscCitationsRegister(citation, &cited)); 2130 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2131 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2132 2133 PetscCall(PetscNew(&b)); 2134 2135 B->data = (void *)b; 2136 2137 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 2138 2139 b->row = NULL; 2140 b->col = NULL; 2141 b->icol = NULL; 2142 b->reallocs = 0; 2143 b->ignorezeroentries = PETSC_FALSE; 2144 b->roworiented = PETSC_TRUE; 2145 b->nonew = 0; 2146 b->diag = NULL; 2147 b->solve_work = NULL; 2148 B->spptr = NULL; 2149 b->saved_values = NULL; 2150 b->idiag = NULL; 2151 b->mdiag = NULL; 2152 b->ssor_work = NULL; 2153 b->omega = 1.0; 2154 b->fshift = 0.0; 2155 b->idiagvalid = PETSC_FALSE; 2156 b->keepnonzeropattern = PETSC_FALSE; 2157 b->sliceheight = 0; 2158 2159 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2160 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2161 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2162 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2163 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2164 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2165 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ)); 2166 #if defined(PETSC_HAVE_CUDA) 2167 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2168 #endif 2169 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2170 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2171 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2172 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2173 2174 PetscObjectOptionsBegin((PetscObject)B); 2175 { 2176 PetscInt newsh = -1; 2177 PetscBool flg; 2178 2179 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2180 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2181 } 2182 PetscOptionsEnd(); 2183 PetscFunctionReturn(PETSC_SUCCESS); 2184 } 2185 2186 /* 2187 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2188 */ 2189 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2190 { 2191 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2192 PetscInt i, m = A->rmap->n; 2193 PetscInt totalslices = a->totalslices; 2194 2195 PetscFunctionBegin; 2196 C->factortype = A->factortype; 2197 c->row = NULL; 2198 c->col = NULL; 2199 c->icol = NULL; 2200 c->reallocs = 0; 2201 C->assembled = PETSC_TRUE; 2202 2203 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2204 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2205 2206 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2207 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2208 2209 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2210 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2211 2212 /* allocate the matrix space */ 2213 if (mallocmatspace) { 2214 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2215 2216 c->singlemalloc = PETSC_TRUE; 2217 2218 if (m > 0) { 2219 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2220 if (cpvalues == MAT_COPY_VALUES) { 2221 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2222 } else { 2223 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2224 } 2225 } 2226 } 2227 2228 c->ignorezeroentries = a->ignorezeroentries; 2229 c->roworiented = a->roworiented; 2230 c->nonew = a->nonew; 2231 if (a->diag) { 2232 PetscCall(PetscMalloc1(m, &c->diag)); 2233 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2234 } else c->diag = NULL; 2235 2236 c->solve_work = NULL; 2237 c->saved_values = NULL; 2238 c->idiag = NULL; 2239 c->ssor_work = NULL; 2240 c->keepnonzeropattern = a->keepnonzeropattern; 2241 c->free_val = PETSC_TRUE; 2242 c->free_colidx = PETSC_TRUE; 2243 2244 c->maxallocmat = a->maxallocmat; 2245 c->maxallocrow = a->maxallocrow; 2246 c->rlenmax = a->rlenmax; 2247 c->nz = a->nz; 2248 C->preallocated = PETSC_TRUE; 2249 2250 c->nonzerorowcnt = a->nonzerorowcnt; 2251 C->nonzerostate = A->nonzerostate; 2252 2253 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2254 PetscFunctionReturn(PETSC_SUCCESS); 2255 } 2256 2257 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2258 { 2259 PetscFunctionBegin; 2260 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2261 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2262 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2263 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2264 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2265 PetscFunctionReturn(PETSC_SUCCESS); 2266 } 2267 2268 /*MC 2269 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2270 based on the sliced Ellpack format 2271 2272 Options Database Key: 2273 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2274 2275 Level: beginner 2276 2277 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2278 M*/ 2279 2280 /*MC 2281 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2282 2283 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2284 and `MATMPISELL` otherwise. As a result, for single process communicators, 2285 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2286 for communicators controlling multiple processes. It is recommended that you call both of 2287 the above preallocation routines for simplicity. 2288 2289 Options Database Key: 2290 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2291 2292 Level: beginner 2293 2294 Notes: 2295 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2296 2297 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2298 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2299 2300 Developer Notes: 2301 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2302 2303 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2304 .vb 2305 (2 0 3 4) 2306 Consider the matrix A = (5 0 6 0) 2307 (0 0 7 8) 2308 (0 0 9 9) 2309 2310 symbolically the Ellpack format can be written as 2311 2312 (2 3 4 |) (0 2 3 |) 2313 v = (5 6 0 |) colidx = (0 2 2 |) 2314 -------- --------- 2315 (7 8 |) (2 3 |) 2316 (9 9 |) (2 3 |) 2317 2318 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2319 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2320 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2321 2322 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2323 2324 .ve 2325 2326 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2327 2328 References: 2329 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2330 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2331 2332 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2333 M*/ 2334 2335 /*@C 2336 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2337 2338 Collective 2339 2340 Input Parameters: 2341 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2342 . m - number of rows 2343 . n - number of columns 2344 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2345 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2346 2347 Output Parameter: 2348 . A - the matrix 2349 2350 Level: intermediate 2351 2352 Notes: 2353 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2354 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2355 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2356 2357 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2358 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2359 allocation. 2360 2361 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2362 @*/ 2363 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2364 { 2365 PetscFunctionBegin; 2366 PetscCall(MatCreate(comm, A)); 2367 PetscCall(MatSetSizes(*A, m, n, m, n)); 2368 PetscCall(MatSetType(*A, MATSEQSELL)); 2369 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2370 PetscFunctionReturn(PETSC_SUCCESS); 2371 } 2372 2373 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2374 { 2375 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2376 PetscInt totalslices = a->totalslices; 2377 2378 PetscFunctionBegin; 2379 /* If the matrix dimensions are not equal,or no of nonzeros */ 2380 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2381 *flg = PETSC_FALSE; 2382 PetscFunctionReturn(PETSC_SUCCESS); 2383 } 2384 /* if the a->colidx are the same */ 2385 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2386 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2387 /* if a->val are the same */ 2388 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2389 PetscFunctionReturn(PETSC_SUCCESS); 2390 } 2391 2392 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2393 { 2394 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2395 2396 PetscFunctionBegin; 2397 a->idiagvalid = PETSC_FALSE; 2398 PetscFunctionReturn(PETSC_SUCCESS); 2399 } 2400 2401 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2402 { 2403 #if defined(PETSC_USE_COMPLEX) 2404 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2405 PetscInt i; 2406 PetscScalar *val = a->val; 2407 2408 PetscFunctionBegin; 2409 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2410 #if defined(PETSC_HAVE_CUDA) 2411 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2412 #endif 2413 #else 2414 PetscFunctionBegin; 2415 #endif 2416 PetscFunctionReturn(PETSC_SUCCESS); 2417 } 2418