1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter nz 48 (or the array nnz). By setting these parameters accurately, performance 49 during matrix assembly can be increased significantly. 50 51 Collective 52 53 Input Parameters: 54 + B - The `MATSEQSELL` matrix 55 . rlenmax - number of nonzeros per row (same for all rows) 56 - rlen - array containing the number of nonzeros in the various rows 57 (possibly different for each row) or `NULL` 58 59 Level: intermediate 60 61 Notes: 62 If `rlen` is given then `rlenmax` is ignored. 63 64 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 65 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 66 allocation. For large problems you MUST preallocate memory or you 67 will get TERRIBLE performance, see the users' manual chapter on matrices. 68 69 You can call `MatGetInfo()` to get information on how effective the preallocation was; 70 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 71 You can also run with the option `-info` and look for messages with the string 72 malloc in them to see if additional memory allocation was needed. 73 74 Developers Note: 75 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 76 entries or columns indices. 77 78 The maximum number of nonzeos in any row should be as accurate as possible. 79 If it is underestimated, you will get bad performance due to reallocation 80 (`MatSeqXSELLReallocateSELL()`). 81 82 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 83 @*/ 84 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 85 { 86 PetscFunctionBegin; 87 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 88 PetscValidType(B, 1); 89 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 90 PetscFunctionReturn(PETSC_SUCCESS); 91 } 92 93 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 94 { 95 Mat_SeqSELL *b; 96 PetscInt i, j, totalslices; 97 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 98 99 PetscFunctionBegin; 100 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 101 if (maxallocrow == MAT_SKIP_ALLOCATION) { 102 skipallocation = PETSC_TRUE; 103 maxallocrow = 0; 104 } 105 106 PetscCall(PetscLayoutSetUp(B->rmap)); 107 PetscCall(PetscLayoutSetUp(B->cmap)); 108 109 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 110 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 111 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 112 if (rlen) { 113 for (i = 0; i < B->rmap->n; i++) { 114 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 115 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 116 } 117 } 118 119 B->preallocated = PETSC_TRUE; 120 121 b = (Mat_SeqSELL *)B->data; 122 123 totalslices = PetscCeilInt(B->rmap->n, 8); 124 b->totalslices = totalslices; 125 if (!skipallocation) { 126 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n)); 127 128 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 129 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 130 } 131 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 132 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 133 else if (maxallocrow < 0) maxallocrow = 1; 134 for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow; 135 } else { 136 maxallocrow = 0; 137 b->sliidx[0] = 0; 138 for (i = 1; i < totalslices; i++) { 139 b->sliidx[i] = 0; 140 for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); 141 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 142 PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i])); 143 } 144 /* last slice */ 145 b->sliidx[totalslices] = 0; 146 for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 147 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 148 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices]; 149 } 150 151 /* allocate space for val, colidx, rlen */ 152 /* FIXME: should B's old memory be unlogged? */ 153 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 154 /* FIXME: assuming an element of the bit array takes 8 bits */ 155 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 156 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 157 PetscCall(PetscCalloc1(8 * totalslices, &b->rlen)); 158 159 b->singlemalloc = PETSC_TRUE; 160 b->free_val = PETSC_TRUE; 161 b->free_colidx = PETSC_TRUE; 162 } else { 163 b->free_val = PETSC_FALSE; 164 b->free_colidx = PETSC_FALSE; 165 } 166 167 b->nz = 0; 168 b->maxallocrow = maxallocrow; 169 b->rlenmax = maxallocrow; 170 b->maxallocmat = b->sliidx[totalslices]; 171 B->info.nz_unneeded = (double)b->maxallocmat; 172 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 173 PetscFunctionReturn(PETSC_SUCCESS); 174 } 175 176 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 177 { 178 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 179 PetscInt shift; 180 181 PetscFunctionBegin; 182 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 183 if (nz) *nz = a->rlen[row]; 184 shift = a->sliidx[row >> 3] + (row & 0x07); 185 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 186 if (idx) { 187 PetscInt j; 188 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j]; 189 *idx = a->getrowcols; 190 } 191 if (v) { 192 PetscInt j; 193 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j]; 194 *v = a->getrowvals; 195 } 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 200 { 201 PetscFunctionBegin; 202 PetscFunctionReturn(PETSC_SUCCESS); 203 } 204 205 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 206 { 207 Mat B; 208 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 209 PetscInt i; 210 211 PetscFunctionBegin; 212 if (reuse == MAT_REUSE_MATRIX) { 213 B = *newmat; 214 PetscCall(MatZeroEntries(B)); 215 } else { 216 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 217 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 218 PetscCall(MatSetType(B, MATSEQAIJ)); 219 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 220 } 221 222 for (i = 0; i < A->rmap->n; i++) { 223 PetscInt nz = 0, *cols = NULL; 224 PetscScalar *vals = NULL; 225 226 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 227 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 228 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 229 } 230 231 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 232 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 233 B->rmap->bs = A->rmap->bs; 234 235 if (reuse == MAT_INPLACE_MATRIX) { 236 PetscCall(MatHeaderReplace(A, &B)); 237 } else { 238 *newmat = B; 239 } 240 PetscFunctionReturn(PETSC_SUCCESS); 241 } 242 243 #include <../src/mat/impls/aij/seq/aij.h> 244 245 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 246 { 247 Mat B; 248 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 249 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 250 const PetscInt *cols; 251 const PetscScalar *vals; 252 253 PetscFunctionBegin; 254 255 if (reuse == MAT_REUSE_MATRIX) { 256 B = *newmat; 257 } else { 258 if (PetscDefined(USE_DEBUG) || !a->ilen) { 259 PetscCall(PetscMalloc1(m, &rowlengths)); 260 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 261 } 262 if (PetscDefined(USE_DEBUG) && a->ilen) { 263 PetscBool eq; 264 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 265 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 266 PetscCall(PetscFree(rowlengths)); 267 rowlengths = a->ilen; 268 } else if (a->ilen) rowlengths = a->ilen; 269 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 270 PetscCall(MatSetSizes(B, m, n, m, n)); 271 PetscCall(MatSetType(B, MATSEQSELL)); 272 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 273 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 274 } 275 276 for (row = 0; row < m; row++) { 277 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 278 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 279 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 280 } 281 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 282 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 283 B->rmap->bs = A->rmap->bs; 284 285 if (reuse == MAT_INPLACE_MATRIX) { 286 PetscCall(MatHeaderReplace(A, &B)); 287 } else { 288 *newmat = B; 289 } 290 PetscFunctionReturn(PETSC_SUCCESS); 291 } 292 293 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 294 { 295 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 296 PetscScalar *y; 297 const PetscScalar *x; 298 const MatScalar *aval = a->val; 299 PetscInt totalslices = a->totalslices; 300 const PetscInt *acolidx = a->colidx; 301 PetscInt i, j; 302 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 303 __m512d vec_x, vec_y, vec_vals; 304 __m256i vec_idx; 305 __mmask8 mask; 306 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 307 __m256i vec_idx2, vec_idx3, vec_idx4; 308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 309 __m128i vec_idx; 310 __m256d vec_x, vec_y, vec_y2, vec_vals; 311 MatScalar yval; 312 PetscInt r, rows_left, row, nnz_in_row; 313 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 314 __m128d vec_x_tmp; 315 __m256d vec_x, vec_y, vec_y2, vec_vals; 316 MatScalar yval; 317 PetscInt r, rows_left, row, nnz_in_row; 318 #else 319 PetscScalar sum[8]; 320 #endif 321 322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 323 #pragma disjoint(*x, *y, *aval) 324 #endif 325 326 PetscFunctionBegin; 327 PetscCall(VecGetArrayRead(xx, &x)); 328 PetscCall(VecGetArray(yy, &y)); 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 for (i = 0; i < totalslices; i++) { /* loop over slices */ 331 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 332 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 333 334 vec_y = _mm512_setzero_pd(); 335 vec_y2 = _mm512_setzero_pd(); 336 vec_y3 = _mm512_setzero_pd(); 337 vec_y4 = _mm512_setzero_pd(); 338 339 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 340 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 341 case 3: 342 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 343 acolidx += 8; 344 aval += 8; 345 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 346 acolidx += 8; 347 aval += 8; 348 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 349 acolidx += 8; 350 aval += 8; 351 j += 3; 352 break; 353 case 2: 354 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 355 acolidx += 8; 356 aval += 8; 357 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 358 acolidx += 8; 359 aval += 8; 360 j += 2; 361 break; 362 case 1: 363 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 364 acolidx += 8; 365 aval += 8; 366 j += 1; 367 break; 368 } 369 #pragma novector 370 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 381 acolidx += 8; 382 aval += 8; 383 } 384 385 vec_y = _mm512_add_pd(vec_y, vec_y2); 386 vec_y = _mm512_add_pd(vec_y, vec_y3); 387 vec_y = _mm512_add_pd(vec_y, vec_y4); 388 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 389 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 390 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 391 } else { 392 _mm512_storeu_pd(&y[8 * i], vec_y); 393 } 394 } 395 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 396 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 397 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 398 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 399 400 /* last slice may have padding rows. Don't use vectorization. */ 401 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 402 rows_left = A->rmap->n - 8 * i; 403 for (r = 0; r < rows_left; ++r) { 404 yval = (MatScalar)0; 405 row = 8 * i + r; 406 nnz_in_row = a->rlen[row]; 407 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 408 y[row] = yval; 409 } 410 break; 411 } 412 413 vec_y = _mm256_setzero_pd(); 414 vec_y2 = _mm256_setzero_pd(); 415 416 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 417 #pragma novector 418 #pragma unroll(2) 419 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 420 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 421 aval += 4; 422 acolidx += 4; 423 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 424 aval += 4; 425 acolidx += 4; 426 } 427 428 _mm256_storeu_pd(y + i * 8, vec_y); 429 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 430 } 431 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 432 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 433 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 434 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 435 436 vec_y = _mm256_setzero_pd(); 437 vec_y2 = _mm256_setzero_pd(); 438 439 /* last slice may have padding rows. Don't use vectorization. */ 440 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 441 rows_left = A->rmap->n - 8 * i; 442 for (r = 0; r < rows_left; ++r) { 443 yval = (MatScalar)0; 444 row = 8 * i + r; 445 nnz_in_row = a->rlen[row]; 446 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 447 y[row] = yval; 448 } 449 break; 450 } 451 452 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 453 #pragma novector 454 #pragma unroll(2) 455 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 456 vec_vals = _mm256_loadu_pd(aval); 457 vec_x_tmp = _mm_setzero_pd(); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 464 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 465 aval += 4; 466 467 vec_vals = _mm256_loadu_pd(aval); 468 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 469 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 470 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 471 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 472 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 473 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 474 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 475 aval += 4; 476 } 477 478 _mm256_storeu_pd(y + i * 8, vec_y); 479 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 480 } 481 #else 482 for (i = 0; i < totalslices; i++) { /* loop over slices */ 483 for (j = 0; j < 8; j++) sum[j] = 0.0; 484 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 485 sum[0] += aval[j] * x[acolidx[j]]; 486 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 487 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 488 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 489 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 490 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 491 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 492 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 493 } 494 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 495 for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j]; 496 } else { 497 for (j = 0; j < 8; j++) y[8 * i + j] = sum[j]; 498 } 499 } 500 #endif 501 502 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 503 PetscCall(VecRestoreArrayRead(xx, &x)); 504 PetscCall(VecRestoreArray(yy, &y)); 505 PetscFunctionReturn(PETSC_SUCCESS); 506 } 507 508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 509 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 510 { 511 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 512 PetscScalar *y, *z; 513 const PetscScalar *x; 514 const MatScalar *aval = a->val; 515 PetscInt totalslices = a->totalslices; 516 const PetscInt *acolidx = a->colidx; 517 PetscInt i, j; 518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 519 __m512d vec_x, vec_y, vec_vals; 520 __m256i vec_idx; 521 __mmask8 mask; 522 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 523 __m256i vec_idx2, vec_idx3, vec_idx4; 524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 525 __m128d vec_x_tmp; 526 __m256d vec_x, vec_y, vec_y2, vec_vals; 527 MatScalar yval; 528 PetscInt r, row, nnz_in_row; 529 #else 530 PetscScalar sum[8]; 531 #endif 532 533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 534 #pragma disjoint(*x, *y, *aval) 535 #endif 536 537 PetscFunctionBegin; 538 PetscCall(VecGetArrayRead(xx, &x)); 539 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 541 for (i = 0; i < totalslices; i++) { /* loop over slices */ 542 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 543 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 544 545 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 546 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 547 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 548 } else { 549 vec_y = _mm512_loadu_pd(&y[8 * i]); 550 } 551 vec_y2 = _mm512_setzero_pd(); 552 vec_y3 = _mm512_setzero_pd(); 553 vec_y4 = _mm512_setzero_pd(); 554 555 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 556 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 557 case 3: 558 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 559 acolidx += 8; 560 aval += 8; 561 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 562 acolidx += 8; 563 aval += 8; 564 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 565 acolidx += 8; 566 aval += 8; 567 j += 3; 568 break; 569 case 2: 570 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 571 acolidx += 8; 572 aval += 8; 573 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 574 acolidx += 8; 575 aval += 8; 576 j += 2; 577 break; 578 case 1: 579 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 580 acolidx += 8; 581 aval += 8; 582 j += 1; 583 break; 584 } 585 #pragma novector 586 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 587 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 588 acolidx += 8; 589 aval += 8; 590 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 597 acolidx += 8; 598 aval += 8; 599 } 600 601 vec_y = _mm512_add_pd(vec_y, vec_y2); 602 vec_y = _mm512_add_pd(vec_y, vec_y3); 603 vec_y = _mm512_add_pd(vec_y, vec_y4); 604 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 605 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 606 } else { 607 _mm512_storeu_pd(&z[8 * i], vec_y); 608 } 609 } 610 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 611 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 612 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 613 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 614 615 /* last slice may have padding rows. Don't use vectorization. */ 616 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 617 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 618 row = 8 * i + r; 619 yval = (MatScalar)0.0; 620 nnz_in_row = a->rlen[row]; 621 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 622 z[row] = y[row] + yval; 623 } 624 break; 625 } 626 627 vec_y = _mm256_loadu_pd(y + 8 * i); 628 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 629 630 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 631 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 632 vec_vals = _mm256_loadu_pd(aval); 633 vec_x_tmp = _mm_setzero_pd(); 634 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 635 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 636 vec_x = _mm256_setzero_pd(); 637 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 638 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 639 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 640 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 641 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 642 aval += 4; 643 644 vec_vals = _mm256_loadu_pd(aval); 645 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 646 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 647 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 648 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 649 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 650 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 651 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 652 aval += 4; 653 } 654 655 _mm256_storeu_pd(z + i * 8, vec_y); 656 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 657 } 658 #else 659 for (i = 0; i < totalslices; i++) { /* loop over slices */ 660 for (j = 0; j < 8; j++) sum[j] = 0.0; 661 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 662 sum[0] += aval[j] * x[acolidx[j]]; 663 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 664 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 665 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 666 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 667 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 668 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 669 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 670 } 671 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 672 for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 673 } else { 674 for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 675 } 676 } 677 #endif 678 679 PetscCall(PetscLogFlops(2.0 * a->nz)); 680 PetscCall(VecRestoreArrayRead(xx, &x)); 681 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 682 PetscFunctionReturn(PETSC_SUCCESS); 683 } 684 685 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 686 { 687 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 688 PetscScalar *y; 689 const PetscScalar *x; 690 const MatScalar *aval = a->val; 691 const PetscInt *acolidx = a->colidx; 692 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices; 693 694 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 695 #pragma disjoint(*x, *y, *aval) 696 #endif 697 698 PetscFunctionBegin; 699 if (A->symmetric == PETSC_BOOL3_TRUE) { 700 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 701 PetscFunctionReturn(PETSC_SUCCESS); 702 } 703 if (zz != yy) PetscCall(VecCopy(zz, yy)); 704 PetscCall(VecGetArrayRead(xx, &x)); 705 PetscCall(VecGetArray(yy, &y)); 706 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 707 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 708 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 709 row = 8 * i + r; 710 nnz_in_row = a->rlen[row]; 711 for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row]; 712 } 713 break; 714 } 715 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 716 y[acolidx[j]] += aval[j] * x[8 * i]; 717 y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1]; 718 y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2]; 719 y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3]; 720 y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4]; 721 y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5]; 722 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 723 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 724 } 725 } 726 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 727 PetscCall(VecRestoreArrayRead(xx, &x)); 728 PetscCall(VecRestoreArray(yy, &y)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 732 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 733 { 734 PetscFunctionBegin; 735 if (A->symmetric == PETSC_BOOL3_TRUE) { 736 PetscCall(MatMult_SeqSELL(A, xx, yy)); 737 } else { 738 PetscCall(VecSet(yy, 0.0)); 739 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 740 } 741 PetscFunctionReturn(PETSC_SUCCESS); 742 } 743 744 /* 745 Checks for missing diagonals 746 */ 747 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 748 { 749 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 750 PetscInt *diag, i; 751 752 PetscFunctionBegin; 753 *missing = PETSC_FALSE; 754 if (A->rmap->n > 0 && !(a->colidx)) { 755 *missing = PETSC_TRUE; 756 if (d) *d = 0; 757 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 758 } else { 759 diag = a->diag; 760 for (i = 0; i < A->rmap->n; i++) { 761 if (diag[i] == -1) { 762 *missing = PETSC_TRUE; 763 if (d) *d = i; 764 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 765 break; 766 } 767 } 768 } 769 PetscFunctionReturn(PETSC_SUCCESS); 770 } 771 772 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 773 { 774 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 775 PetscInt i, j, m = A->rmap->n, shift; 776 777 PetscFunctionBegin; 778 if (!a->diag) { 779 PetscCall(PetscMalloc1(m, &a->diag)); 780 a->free_diag = PETSC_TRUE; 781 } 782 for (i = 0; i < m; i++) { /* loop over rows */ 783 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 784 a->diag[i] = -1; 785 for (j = 0; j < a->rlen[i]; j++) { 786 if (a->colidx[shift + j * 8] == i) { 787 a->diag[i] = shift + j * 8; 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 /* 796 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 797 */ 798 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 799 { 800 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 801 PetscInt i, *diag, m = A->rmap->n; 802 MatScalar *val = a->val; 803 PetscScalar *idiag, *mdiag; 804 805 PetscFunctionBegin; 806 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 807 PetscCall(MatMarkDiagonal_SeqSELL(A)); 808 diag = a->diag; 809 if (!a->idiag) { 810 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 811 val = a->val; 812 } 813 mdiag = a->mdiag; 814 idiag = a->idiag; 815 816 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 817 for (i = 0; i < m; i++) { 818 mdiag[i] = val[diag[i]]; 819 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 820 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 821 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 822 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 823 A->factorerror_zeropivot_value = 0.0; 824 A->factorerror_zeropivot_row = i; 825 } 826 idiag[i] = 1.0 / val[diag[i]]; 827 } 828 PetscCall(PetscLogFlops(m)); 829 } else { 830 for (i = 0; i < m; i++) { 831 mdiag[i] = val[diag[i]]; 832 idiag[i] = omega / (fshift + val[diag[i]]); 833 } 834 PetscCall(PetscLogFlops(2.0 * m)); 835 } 836 a->idiagvalid = PETSC_TRUE; 837 PetscFunctionReturn(PETSC_SUCCESS); 838 } 839 840 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 841 { 842 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 843 844 PetscFunctionBegin; 845 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 846 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 847 PetscFunctionReturn(PETSC_SUCCESS); 848 } 849 850 PetscErrorCode MatDestroy_SeqSELL(Mat A) 851 { 852 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 853 854 PetscFunctionBegin; 855 #if defined(PETSC_USE_LOG) 856 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 857 #endif 858 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 859 PetscCall(ISDestroy(&a->row)); 860 PetscCall(ISDestroy(&a->col)); 861 PetscCall(PetscFree(a->diag)); 862 PetscCall(PetscFree(a->rlen)); 863 PetscCall(PetscFree(a->sliidx)); 864 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 865 PetscCall(PetscFree(a->solve_work)); 866 PetscCall(ISDestroy(&a->icol)); 867 PetscCall(PetscFree(a->saved_values)); 868 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 869 870 PetscCall(PetscFree(A->data)); 871 872 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 873 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 874 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 875 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 876 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 877 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 878 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 879 PetscFunctionReturn(PETSC_SUCCESS); 880 } 881 882 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 883 { 884 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 885 886 PetscFunctionBegin; 887 switch (op) { 888 case MAT_ROW_ORIENTED: 889 a->roworiented = flg; 890 break; 891 case MAT_KEEP_NONZERO_PATTERN: 892 a->keepnonzeropattern = flg; 893 break; 894 case MAT_NEW_NONZERO_LOCATIONS: 895 a->nonew = (flg ? 0 : 1); 896 break; 897 case MAT_NEW_NONZERO_LOCATION_ERR: 898 a->nonew = (flg ? -1 : 0); 899 break; 900 case MAT_NEW_NONZERO_ALLOCATION_ERR: 901 a->nonew = (flg ? -2 : 0); 902 break; 903 case MAT_UNUSED_NONZERO_LOCATION_ERR: 904 a->nounused = (flg ? -1 : 0); 905 break; 906 case MAT_FORCE_DIAGONAL_ENTRIES: 907 case MAT_IGNORE_OFF_PROC_ENTRIES: 908 case MAT_USE_HASH_TABLE: 909 case MAT_SORTED_FULL: 910 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 911 break; 912 case MAT_SPD: 913 case MAT_SYMMETRIC: 914 case MAT_STRUCTURALLY_SYMMETRIC: 915 case MAT_HERMITIAN: 916 case MAT_SYMMETRY_ETERNAL: 917 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 918 case MAT_SPD_ETERNAL: 919 /* These options are handled directly by MatSetOption() */ 920 break; 921 default: 922 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 923 } 924 PetscFunctionReturn(PETSC_SUCCESS); 925 } 926 927 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 928 { 929 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 930 PetscInt i, j, n, shift; 931 PetscScalar *x, zero = 0.0; 932 933 PetscFunctionBegin; 934 PetscCall(VecGetLocalSize(v, &n)); 935 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 936 937 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 938 PetscInt *diag = a->diag; 939 PetscCall(VecGetArray(v, &x)); 940 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 941 PetscCall(VecRestoreArray(v, &x)); 942 PetscFunctionReturn(PETSC_SUCCESS); 943 } 944 945 PetscCall(VecSet(v, zero)); 946 PetscCall(VecGetArray(v, &x)); 947 for (i = 0; i < n; i++) { /* loop over rows */ 948 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 949 x[i] = 0; 950 for (j = 0; j < a->rlen[i]; j++) { 951 if (a->colidx[shift + j * 8] == i) { 952 x[i] = a->val[shift + j * 8]; 953 break; 954 } 955 } 956 } 957 PetscCall(VecRestoreArray(v, &x)); 958 PetscFunctionReturn(PETSC_SUCCESS); 959 } 960 961 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 962 { 963 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 964 const PetscScalar *l, *r; 965 PetscInt i, j, m, n, row; 966 967 PetscFunctionBegin; 968 if (ll) { 969 /* The local size is used so that VecMPI can be passed to this routine 970 by MatDiagonalScale_MPISELL */ 971 PetscCall(VecGetLocalSize(ll, &m)); 972 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 973 PetscCall(VecGetArrayRead(ll, &l)); 974 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 975 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 976 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 977 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row]; 978 } 979 } else { 980 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row]; 981 } 982 } 983 PetscCall(VecRestoreArrayRead(ll, &l)); 984 PetscCall(PetscLogFlops(a->nz)); 985 } 986 if (rr) { 987 PetscCall(VecGetLocalSize(rr, &n)); 988 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 989 PetscCall(VecGetArrayRead(rr, &r)); 990 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 991 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 992 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 993 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 994 } 995 } else { 996 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 997 } 998 } 999 PetscCall(VecRestoreArrayRead(rr, &r)); 1000 PetscCall(PetscLogFlops(a->nz)); 1001 } 1002 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1003 PetscFunctionReturn(PETSC_SUCCESS); 1004 } 1005 1006 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1007 { 1008 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1009 PetscInt *cp, i, k, low, high, t, row, col, l; 1010 PetscInt shift; 1011 MatScalar *vp; 1012 1013 PetscFunctionBegin; 1014 for (k = 0; k < m; k++) { /* loop over requested rows */ 1015 row = im[k]; 1016 if (row < 0) continue; 1017 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1018 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1019 cp = a->colidx + shift; /* pointer to the row */ 1020 vp = a->val + shift; /* pointer to the row */ 1021 for (l = 0; l < n; l++) { /* loop over requested columns */ 1022 col = in[l]; 1023 if (col < 0) continue; 1024 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1025 high = a->rlen[row]; 1026 low = 0; /* assume unsorted */ 1027 while (high - low > 5) { 1028 t = (low + high) / 2; 1029 if (*(cp + t * 8) > col) high = t; 1030 else low = t; 1031 } 1032 for (i = low; i < high; i++) { 1033 if (*(cp + 8 * i) > col) break; 1034 if (*(cp + 8 * i) == col) { 1035 *v++ = *(vp + 8 * i); 1036 goto finished; 1037 } 1038 } 1039 *v++ = 0.0; 1040 finished:; 1041 } 1042 } 1043 PetscFunctionReturn(PETSC_SUCCESS); 1044 } 1045 1046 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1047 { 1048 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1049 PetscInt i, j, m = A->rmap->n, shift; 1050 const char *name; 1051 PetscViewerFormat format; 1052 1053 PetscFunctionBegin; 1054 PetscCall(PetscViewerGetFormat(viewer, &format)); 1055 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1056 PetscInt nofinalvalue = 0; 1057 /* 1058 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1059 nofinalvalue = 1; 1060 } 1061 */ 1062 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1063 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1064 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1065 #if defined(PETSC_USE_COMPLEX) 1066 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1067 #else 1068 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1069 #endif 1070 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1071 1072 for (i = 0; i < m; i++) { 1073 shift = a->sliidx[i >> 3] + (i & 0x07); 1074 for (j = 0; j < a->rlen[i]; j++) { 1075 #if defined(PETSC_USE_COMPLEX) 1076 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1077 #else 1078 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1079 #endif 1080 } 1081 } 1082 /* 1083 if (nofinalvalue) { 1084 #if defined(PETSC_USE_COMPLEX) 1085 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1086 #else 1087 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1088 #endif 1089 } 1090 */ 1091 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1092 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1093 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1094 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1095 PetscFunctionReturn(PETSC_SUCCESS); 1096 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1097 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1098 for (i = 0; i < m; i++) { 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1100 shift = a->sliidx[i >> 3] + (i & 0x07); 1101 for (j = 0; j < a->rlen[i]; j++) { 1102 #if defined(PETSC_USE_COMPLEX) 1103 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1104 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1105 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1106 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1107 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1108 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1109 } 1110 #else 1111 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1112 #endif 1113 } 1114 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1115 } 1116 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1117 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1118 PetscInt cnt = 0, jcnt; 1119 PetscScalar value; 1120 #if defined(PETSC_USE_COMPLEX) 1121 PetscBool realonly = PETSC_TRUE; 1122 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1123 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1124 realonly = PETSC_FALSE; 1125 break; 1126 } 1127 } 1128 #endif 1129 1130 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1131 for (i = 0; i < m; i++) { 1132 jcnt = 0; 1133 shift = a->sliidx[i >> 3] + (i & 0x07); 1134 for (j = 0; j < A->cmap->n; j++) { 1135 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1136 value = a->val[cnt++]; 1137 jcnt++; 1138 } else { 1139 value = 0.0; 1140 } 1141 #if defined(PETSC_USE_COMPLEX) 1142 if (realonly) { 1143 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1144 } else { 1145 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1146 } 1147 #else 1148 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1149 #endif 1150 } 1151 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1152 } 1153 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1154 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1155 PetscInt fshift = 1; 1156 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1157 #if defined(PETSC_USE_COMPLEX) 1158 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1159 #else 1160 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1161 #endif 1162 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1163 for (i = 0; i < m; i++) { 1164 shift = a->sliidx[i >> 3] + (i & 0x07); 1165 for (j = 0; j < a->rlen[i]; j++) { 1166 #if defined(PETSC_USE_COMPLEX) 1167 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1168 #else 1169 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1170 #endif 1171 } 1172 } 1173 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1174 } else if (format == PETSC_VIEWER_NATIVE) { 1175 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1176 PetscInt row; 1177 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1178 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1179 #if defined(PETSC_USE_COMPLEX) 1180 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1181 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1182 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1183 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1184 } else { 1185 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1186 } 1187 #else 1188 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1189 #endif 1190 } 1191 } 1192 } else { 1193 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1194 if (A->factortype) { 1195 for (i = 0; i < m; i++) { 1196 shift = a->sliidx[i >> 3] + (i & 0x07); 1197 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1198 /* L part */ 1199 for (j = shift; j < a->diag[i]; j += 8) { 1200 #if defined(PETSC_USE_COMPLEX) 1201 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1202 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1203 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1204 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1205 } else { 1206 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1207 } 1208 #else 1209 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1210 #endif 1211 } 1212 /* diagonal */ 1213 j = a->diag[i]; 1214 #if defined(PETSC_USE_COMPLEX) 1215 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1217 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1219 } else { 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1221 } 1222 #else 1223 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1224 #endif 1225 1226 /* U part */ 1227 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1228 #if defined(PETSC_USE_COMPLEX) 1229 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1230 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1231 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1232 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1233 } else { 1234 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1235 } 1236 #else 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1238 #endif 1239 } 1240 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1241 } 1242 } else { 1243 for (i = 0; i < m; i++) { 1244 shift = a->sliidx[i >> 3] + (i & 0x07); 1245 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1246 for (j = 0; j < a->rlen[i]; j++) { 1247 #if defined(PETSC_USE_COMPLEX) 1248 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1250 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1252 } else { 1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1254 } 1255 #else 1256 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1257 #endif 1258 } 1259 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1260 } 1261 } 1262 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1263 } 1264 PetscCall(PetscViewerFlush(viewer)); 1265 PetscFunctionReturn(PETSC_SUCCESS); 1266 } 1267 1268 #include <petscdraw.h> 1269 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1270 { 1271 Mat A = (Mat)Aa; 1272 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1273 PetscInt i, j, m = A->rmap->n, shift; 1274 int color; 1275 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1276 PetscViewer viewer; 1277 PetscViewerFormat format; 1278 1279 PetscFunctionBegin; 1280 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1281 PetscCall(PetscViewerGetFormat(viewer, &format)); 1282 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1283 1284 /* loop over matrix elements drawing boxes */ 1285 1286 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1287 PetscDrawCollectiveBegin(draw); 1288 /* Blue for negative, Cyan for zero and Red for positive */ 1289 color = PETSC_DRAW_BLUE; 1290 for (i = 0; i < m; i++) { 1291 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1292 y_l = m - i - 1.0; 1293 y_r = y_l + 1.0; 1294 for (j = 0; j < a->rlen[i]; j++) { 1295 x_l = a->colidx[shift + j * 8]; 1296 x_r = x_l + 1.0; 1297 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue; 1298 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1299 } 1300 } 1301 color = PETSC_DRAW_CYAN; 1302 for (i = 0; i < m; i++) { 1303 shift = a->sliidx[i >> 3] + (i & 0x07); 1304 y_l = m - i - 1.0; 1305 y_r = y_l + 1.0; 1306 for (j = 0; j < a->rlen[i]; j++) { 1307 x_l = a->colidx[shift + j * 8]; 1308 x_r = x_l + 1.0; 1309 if (a->val[shift + 8 * j] != 0.) continue; 1310 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1311 } 1312 } 1313 color = PETSC_DRAW_RED; 1314 for (i = 0; i < m; i++) { 1315 shift = a->sliidx[i >> 3] + (i & 0x07); 1316 y_l = m - i - 1.0; 1317 y_r = y_l + 1.0; 1318 for (j = 0; j < a->rlen[i]; j++) { 1319 x_l = a->colidx[shift + j * 8]; 1320 x_r = x_l + 1.0; 1321 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1322 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1323 } 1324 } 1325 PetscDrawCollectiveEnd(draw); 1326 } else { 1327 /* use contour shading to indicate magnitude of values */ 1328 /* first determine max of all nonzero values */ 1329 PetscReal minv = 0.0, maxv = 0.0; 1330 PetscInt count = 0; 1331 PetscDraw popup; 1332 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1333 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1334 } 1335 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1336 PetscCall(PetscDrawGetPopup(draw, &popup)); 1337 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1338 1339 PetscDrawCollectiveBegin(draw); 1340 for (i = 0; i < m; i++) { 1341 shift = a->sliidx[i >> 3] + (i & 0x07); 1342 y_l = m - i - 1.0; 1343 y_r = y_l + 1.0; 1344 for (j = 0; j < a->rlen[i]; j++) { 1345 x_l = a->colidx[shift + j * 8]; 1346 x_r = x_l + 1.0; 1347 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1348 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1349 count++; 1350 } 1351 } 1352 PetscDrawCollectiveEnd(draw); 1353 } 1354 PetscFunctionReturn(PETSC_SUCCESS); 1355 } 1356 1357 #include <petscdraw.h> 1358 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1359 { 1360 PetscDraw draw; 1361 PetscReal xr, yr, xl, yl, h, w; 1362 PetscBool isnull; 1363 1364 PetscFunctionBegin; 1365 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1366 PetscCall(PetscDrawIsNull(draw, &isnull)); 1367 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1368 1369 xr = A->cmap->n; 1370 yr = A->rmap->n; 1371 h = yr / 10.0; 1372 w = xr / 10.0; 1373 xr += w; 1374 yr += h; 1375 xl = -w; 1376 yl = -h; 1377 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1378 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1379 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1380 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1381 PetscCall(PetscDrawSave(draw)); 1382 PetscFunctionReturn(PETSC_SUCCESS); 1383 } 1384 1385 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1386 { 1387 PetscBool iascii, isbinary, isdraw; 1388 1389 PetscFunctionBegin; 1390 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1391 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1392 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1393 if (iascii) { 1394 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1395 } else if (isbinary) { 1396 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1397 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1398 PetscFunctionReturn(PETSC_SUCCESS); 1399 } 1400 1401 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1402 { 1403 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1404 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1405 MatScalar *vp; 1406 1407 PetscFunctionBegin; 1408 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1409 /* To do: compress out the unused elements */ 1410 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1411 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1412 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1413 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1414 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1415 for (i = 0; i < a->totalslices; ++i) { 1416 shift = a->sliidx[i]; /* starting index of the slice */ 1417 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1418 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1419 for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */ 1420 row = 8 * i + row_in_slice; 1421 nrow = a->rlen[row]; /* number of nonzeros in row */ 1422 /* 1423 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1424 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1425 */ 1426 lastcol = 0; 1427 if (nrow > 0) { /* nonempty row */ 1428 lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1429 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1430 for (j = 1; j < 8; j++) { 1431 if (a->rlen[8 * i + j]) { 1432 lastcol = cp[j]; 1433 break; 1434 } 1435 } 1436 } else { 1437 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1438 } 1439 1440 for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) { 1441 cp[8 * k + row_in_slice] = lastcol; 1442 vp[8 * k + row_in_slice] = (MatScalar)0; 1443 } 1444 } 1445 } 1446 1447 A->info.mallocs += a->reallocs; 1448 a->reallocs = 0; 1449 1450 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1451 PetscFunctionReturn(PETSC_SUCCESS); 1452 } 1453 1454 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1455 { 1456 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1457 1458 PetscFunctionBegin; 1459 info->block_size = 1.0; 1460 info->nz_allocated = a->maxallocmat; 1461 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1462 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1463 info->assemblies = A->num_ass; 1464 info->mallocs = A->info.mallocs; 1465 info->memory = 0; /* REVIEW ME */ 1466 if (A->factortype) { 1467 info->fill_ratio_given = A->info.fill_ratio_given; 1468 info->fill_ratio_needed = A->info.fill_ratio_needed; 1469 info->factor_mallocs = A->info.factor_mallocs; 1470 } else { 1471 info->fill_ratio_given = 0; 1472 info->fill_ratio_needed = 0; 1473 info->factor_mallocs = 0; 1474 } 1475 PetscFunctionReturn(PETSC_SUCCESS); 1476 } 1477 1478 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1479 { 1480 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1481 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1482 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1483 MatScalar *vp, value; 1484 1485 PetscFunctionBegin; 1486 for (k = 0; k < m; k++) { /* loop over added rows */ 1487 row = im[k]; 1488 if (row < 0) continue; 1489 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1490 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1491 cp = a->colidx + shift; /* pointer to the row */ 1492 vp = a->val + shift; /* pointer to the row */ 1493 nrow = a->rlen[row]; 1494 low = 0; 1495 high = nrow; 1496 1497 for (l = 0; l < n; l++) { /* loop over added columns */ 1498 col = in[l]; 1499 if (col < 0) continue; 1500 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1501 if (a->roworiented) { 1502 value = v[l + k * n]; 1503 } else { 1504 value = v[k + l * m]; 1505 } 1506 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1507 1508 /* search in this row for the specified column, i indicates the column to be set */ 1509 if (col <= lastcol) low = 0; 1510 else high = nrow; 1511 lastcol = col; 1512 while (high - low > 5) { 1513 t = (low + high) / 2; 1514 if (*(cp + t * 8) > col) high = t; 1515 else low = t; 1516 } 1517 for (i = low; i < high; i++) { 1518 if (*(cp + i * 8) > col) break; 1519 if (*(cp + i * 8) == col) { 1520 if (is == ADD_VALUES) *(vp + i * 8) += value; 1521 else *(vp + i * 8) = value; 1522 low = i + 1; 1523 goto noinsert; 1524 } 1525 } 1526 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1527 if (nonew == 1) goto noinsert; 1528 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1529 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1530 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar); 1531 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1532 for (ii = nrow - 1; ii >= i; ii--) { 1533 *(cp + (ii + 1) * 8) = *(cp + ii * 8); 1534 *(vp + (ii + 1) * 8) = *(vp + ii * 8); 1535 } 1536 a->rlen[row]++; 1537 *(cp + i * 8) = col; 1538 *(vp + i * 8) = value; 1539 a->nz++; 1540 A->nonzerostate++; 1541 low = i + 1; 1542 high++; 1543 nrow++; 1544 noinsert:; 1545 } 1546 a->rlen[row] = nrow; 1547 } 1548 PetscFunctionReturn(PETSC_SUCCESS); 1549 } 1550 1551 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1552 { 1553 PetscFunctionBegin; 1554 /* If the two matrices have the same copy implementation, use fast copy. */ 1555 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1556 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1557 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1558 1559 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1560 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1561 } else { 1562 PetscCall(MatCopy_Basic(A, B, str)); 1563 } 1564 PetscFunctionReturn(PETSC_SUCCESS); 1565 } 1566 1567 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1568 { 1569 PetscFunctionBegin; 1570 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1571 PetscFunctionReturn(PETSC_SUCCESS); 1572 } 1573 1574 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1575 { 1576 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1577 1578 PetscFunctionBegin; 1579 *array = a->val; 1580 PetscFunctionReturn(PETSC_SUCCESS); 1581 } 1582 1583 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1584 { 1585 PetscFunctionBegin; 1586 PetscFunctionReturn(PETSC_SUCCESS); 1587 } 1588 1589 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1590 { 1591 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1592 PetscInt i; 1593 MatScalar *aval = a->val; 1594 1595 PetscFunctionBegin; 1596 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1597 PetscFunctionReturn(PETSC_SUCCESS); 1598 } 1599 1600 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1601 { 1602 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1603 PetscInt i; 1604 MatScalar *aval = a->val; 1605 1606 PetscFunctionBegin; 1607 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1608 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1609 PetscFunctionReturn(PETSC_SUCCESS); 1610 } 1611 1612 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1613 { 1614 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1615 MatScalar *aval = a->val; 1616 PetscScalar oalpha = alpha; 1617 PetscBLASInt one = 1, size; 1618 1619 PetscFunctionBegin; 1620 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1621 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1622 PetscCall(PetscLogFlops(a->nz)); 1623 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1624 PetscFunctionReturn(PETSC_SUCCESS); 1625 } 1626 1627 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1628 { 1629 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1630 1631 PetscFunctionBegin; 1632 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1633 PetscCall(MatShift_Basic(Y, a)); 1634 PetscFunctionReturn(PETSC_SUCCESS); 1635 } 1636 1637 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1638 { 1639 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1640 PetscScalar *x, sum, *t; 1641 const MatScalar *idiag = NULL, *mdiag; 1642 const PetscScalar *b, *xb; 1643 PetscInt n, m = A->rmap->n, i, j, shift; 1644 const PetscInt *diag; 1645 1646 PetscFunctionBegin; 1647 its = its * lits; 1648 1649 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1650 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1651 a->fshift = fshift; 1652 a->omega = omega; 1653 1654 diag = a->diag; 1655 t = a->ssor_work; 1656 idiag = a->idiag; 1657 mdiag = a->mdiag; 1658 1659 PetscCall(VecGetArray(xx, &x)); 1660 PetscCall(VecGetArrayRead(bb, &b)); 1661 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1662 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1663 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1664 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1665 1666 if (flag & SOR_ZERO_INITIAL_GUESS) { 1667 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1668 for (i = 0; i < m; i++) { 1669 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1670 sum = b[i]; 1671 n = (diag[i] - shift) / 8; 1672 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1673 t[i] = sum; 1674 x[i] = sum * idiag[i]; 1675 } 1676 xb = t; 1677 PetscCall(PetscLogFlops(a->nz)); 1678 } else xb = b; 1679 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1680 for (i = m - 1; i >= 0; i--) { 1681 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1682 sum = xb[i]; 1683 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1684 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1685 if (xb == b) { 1686 x[i] = sum * idiag[i]; 1687 } else { 1688 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1689 } 1690 } 1691 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1692 } 1693 its--; 1694 } 1695 while (its--) { 1696 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1697 for (i = 0; i < m; i++) { 1698 /* lower */ 1699 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1700 sum = b[i]; 1701 n = (diag[i] - shift) / 8; 1702 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1703 t[i] = sum; /* save application of the lower-triangular part */ 1704 /* upper */ 1705 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1706 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1707 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1708 } 1709 xb = t; 1710 PetscCall(PetscLogFlops(2.0 * a->nz)); 1711 } else xb = b; 1712 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1713 for (i = m - 1; i >= 0; i--) { 1714 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1715 sum = xb[i]; 1716 if (xb == b) { 1717 /* whole matrix (no checkpointing available) */ 1718 n = a->rlen[i]; 1719 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1720 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1721 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1722 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1723 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1724 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1725 } 1726 } 1727 if (xb == b) { 1728 PetscCall(PetscLogFlops(2.0 * a->nz)); 1729 } else { 1730 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1731 } 1732 } 1733 } 1734 PetscCall(VecRestoreArray(xx, &x)); 1735 PetscCall(VecRestoreArrayRead(bb, &b)); 1736 PetscFunctionReturn(PETSC_SUCCESS); 1737 } 1738 1739 /* -------------------------------------------------------------------*/ 1740 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1741 MatGetRow_SeqSELL, 1742 MatRestoreRow_SeqSELL, 1743 MatMult_SeqSELL, 1744 /* 4*/ MatMultAdd_SeqSELL, 1745 MatMultTranspose_SeqSELL, 1746 MatMultTransposeAdd_SeqSELL, 1747 NULL, 1748 NULL, 1749 NULL, 1750 /* 10*/ NULL, 1751 NULL, 1752 NULL, 1753 MatSOR_SeqSELL, 1754 NULL, 1755 /* 15*/ MatGetInfo_SeqSELL, 1756 MatEqual_SeqSELL, 1757 MatGetDiagonal_SeqSELL, 1758 MatDiagonalScale_SeqSELL, 1759 NULL, 1760 /* 20*/ NULL, 1761 MatAssemblyEnd_SeqSELL, 1762 MatSetOption_SeqSELL, 1763 MatZeroEntries_SeqSELL, 1764 /* 24*/ NULL, 1765 NULL, 1766 NULL, 1767 NULL, 1768 NULL, 1769 /* 29*/ MatSetUp_SeqSELL, 1770 NULL, 1771 NULL, 1772 NULL, 1773 NULL, 1774 /* 34*/ MatDuplicate_SeqSELL, 1775 NULL, 1776 NULL, 1777 NULL, 1778 NULL, 1779 /* 39*/ NULL, 1780 NULL, 1781 NULL, 1782 MatGetValues_SeqSELL, 1783 MatCopy_SeqSELL, 1784 /* 44*/ NULL, 1785 MatScale_SeqSELL, 1786 MatShift_SeqSELL, 1787 NULL, 1788 NULL, 1789 /* 49*/ NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 NULL, 1794 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1795 NULL, 1796 NULL, 1797 NULL, 1798 NULL, 1799 /* 59*/ NULL, 1800 MatDestroy_SeqSELL, 1801 MatView_SeqSELL, 1802 NULL, 1803 NULL, 1804 /* 64*/ NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 NULL, 1809 /* 69*/ NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 /* 74*/ NULL, 1815 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1816 NULL, 1817 NULL, 1818 NULL, 1819 /* 79*/ NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 /* 84*/ NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 /* 89*/ NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 NULL, 1834 /* 94*/ NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 NULL, 1839 /* 99*/ NULL, 1840 NULL, 1841 NULL, 1842 MatConjugate_SeqSELL, 1843 NULL, 1844 /*104*/ NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 /*109*/ NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 MatMissingDiagonal_SeqSELL, 1854 /*114*/ NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 /*119*/ NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 /*124*/ NULL, 1865 NULL, 1866 NULL, 1867 NULL, 1868 NULL, 1869 /*129*/ NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 /*134*/ NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 /*139*/ NULL, 1880 NULL, 1881 NULL, 1882 MatFDColoringSetUp_SeqXAIJ, 1883 NULL, 1884 /*144*/ NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 /*150*/ NULL, 1891 NULL}; 1892 1893 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1894 { 1895 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1896 1897 PetscFunctionBegin; 1898 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1899 1900 /* allocate space for values if not already there */ 1901 if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); } 1902 1903 /* copy values over */ 1904 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1905 PetscFunctionReturn(PETSC_SUCCESS); 1906 } 1907 1908 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1909 { 1910 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1911 1912 PetscFunctionBegin; 1913 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1914 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1915 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1916 PetscFunctionReturn(PETSC_SUCCESS); 1917 } 1918 1919 /*@C 1920 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 1921 1922 Not Collective 1923 1924 Input Parameters: 1925 . mat - a `MATSEQSELL` matrix 1926 . array - pointer to the data 1927 1928 Level: intermediate 1929 1930 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1931 @*/ 1932 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 1933 { 1934 PetscFunctionBegin; 1935 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 1936 PetscFunctionReturn(PETSC_SUCCESS); 1937 } 1938 1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1940 { 1941 Mat_SeqSELL *b; 1942 PetscMPIInt size; 1943 1944 PetscFunctionBegin; 1945 PetscCall(PetscCitationsRegister(citation, &cited)); 1946 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 1947 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 1948 1949 PetscCall(PetscNew(&b)); 1950 1951 B->data = (void *)b; 1952 1953 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 1954 1955 b->row = NULL; 1956 b->col = NULL; 1957 b->icol = NULL; 1958 b->reallocs = 0; 1959 b->ignorezeroentries = PETSC_FALSE; 1960 b->roworiented = PETSC_TRUE; 1961 b->nonew = 0; 1962 b->diag = NULL; 1963 b->solve_work = NULL; 1964 B->spptr = NULL; 1965 b->saved_values = NULL; 1966 b->idiag = NULL; 1967 b->mdiag = NULL; 1968 b->ssor_work = NULL; 1969 b->omega = 1.0; 1970 b->fshift = 0.0; 1971 b->idiagvalid = PETSC_FALSE; 1972 b->keepnonzeropattern = PETSC_FALSE; 1973 1974 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 1975 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 1976 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 1977 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 1978 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 1979 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 1980 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 1981 PetscFunctionReturn(PETSC_SUCCESS); 1982 } 1983 1984 /* 1985 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1986 */ 1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 1988 { 1989 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 1990 PetscInt i, m = A->rmap->n; 1991 PetscInt totalslices = a->totalslices; 1992 1993 PetscFunctionBegin; 1994 C->factortype = A->factortype; 1995 c->row = NULL; 1996 c->col = NULL; 1997 c->icol = NULL; 1998 c->reallocs = 0; 1999 C->assembled = PETSC_TRUE; 2000 2001 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2002 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2003 2004 PetscCall(PetscMalloc1(8 * totalslices, &c->rlen)); 2005 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2006 2007 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2008 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2009 2010 /* allocate the matrix space */ 2011 if (mallocmatspace) { 2012 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2013 2014 c->singlemalloc = PETSC_TRUE; 2015 2016 if (m > 0) { 2017 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2018 if (cpvalues == MAT_COPY_VALUES) { 2019 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2020 } else { 2021 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2022 } 2023 } 2024 } 2025 2026 c->ignorezeroentries = a->ignorezeroentries; 2027 c->roworiented = a->roworiented; 2028 c->nonew = a->nonew; 2029 if (a->diag) { 2030 PetscCall(PetscMalloc1(m, &c->diag)); 2031 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2032 } else c->diag = NULL; 2033 2034 c->solve_work = NULL; 2035 c->saved_values = NULL; 2036 c->idiag = NULL; 2037 c->ssor_work = NULL; 2038 c->keepnonzeropattern = a->keepnonzeropattern; 2039 c->free_val = PETSC_TRUE; 2040 c->free_colidx = PETSC_TRUE; 2041 2042 c->maxallocmat = a->maxallocmat; 2043 c->maxallocrow = a->maxallocrow; 2044 c->rlenmax = a->rlenmax; 2045 c->nz = a->nz; 2046 C->preallocated = PETSC_TRUE; 2047 2048 c->nonzerorowcnt = a->nonzerorowcnt; 2049 C->nonzerostate = A->nonzerostate; 2050 2051 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2052 PetscFunctionReturn(PETSC_SUCCESS); 2053 } 2054 2055 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2056 { 2057 PetscFunctionBegin; 2058 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2059 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2060 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2061 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2062 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2063 PetscFunctionReturn(PETSC_SUCCESS); 2064 } 2065 2066 /*MC 2067 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2068 based on the sliced Ellpack format 2069 2070 Options Database Keys: 2071 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2072 2073 Level: beginner 2074 2075 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2076 M*/ 2077 2078 /*MC 2079 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2080 2081 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2082 and `MATMPISELL` otherwise. As a result, for single process communicators, 2083 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2084 for communicators controlling multiple processes. It is recommended that you call both of 2085 the above preallocation routines for simplicity. 2086 2087 Options Database Keys: 2088 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2089 2090 Level: beginner 2091 2092 Notes: 2093 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2094 2095 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2096 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2097 2098 Developer Notes: 2099 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2100 2101 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2102 .vb 2103 (2 0 3 4) 2104 Consider the matrix A = (5 0 6 0) 2105 (0 0 7 8) 2106 (0 0 9 9) 2107 2108 symbolically the Ellpack format can be written as 2109 2110 (2 3 4 |) (0 2 3 |) 2111 v = (5 6 0 |) colidx = (0 2 2 |) 2112 -------- --------- 2113 (7 8 |) (2 3 |) 2114 (9 9 |) (2 3 |) 2115 2116 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2117 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2118 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2119 2120 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2121 2122 .ve 2123 2124 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2125 2126 References: 2127 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2128 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2129 2130 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2131 M*/ 2132 2133 /*@C 2134 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2135 2136 Collective on comm 2137 2138 Input Parameters: 2139 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2140 . m - number of rows 2141 . n - number of columns 2142 . rlenmax - maximum number of nonzeros in a row 2143 - rlen - array containing the number of nonzeros in the various rows 2144 (possibly different for each row) or NULL 2145 2146 Output Parameter: 2147 . A - the matrix 2148 2149 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2150 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2151 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2152 2153 Notes: 2154 If nnz is given then nz is ignored 2155 2156 Specify the preallocated storage with either rlenmax or rlen (not both). 2157 Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory 2158 allocation. For large problems you MUST preallocate memory or you 2159 will get TERRIBLE performance, see the users' manual chapter on matrices. 2160 2161 Level: intermediate 2162 2163 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2164 @*/ 2165 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) 2166 { 2167 PetscFunctionBegin; 2168 PetscCall(MatCreate(comm, A)); 2169 PetscCall(MatSetSizes(*A, m, n, m, n)); 2170 PetscCall(MatSetType(*A, MATSEQSELL)); 2171 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen)); 2172 PetscFunctionReturn(PETSC_SUCCESS); 2173 } 2174 2175 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2176 { 2177 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2178 PetscInt totalslices = a->totalslices; 2179 2180 PetscFunctionBegin; 2181 /* If the matrix dimensions are not equal,or no of nonzeros */ 2182 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2183 *flg = PETSC_FALSE; 2184 PetscFunctionReturn(PETSC_SUCCESS); 2185 } 2186 /* if the a->colidx are the same */ 2187 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2188 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2189 /* if a->val are the same */ 2190 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2191 PetscFunctionReturn(PETSC_SUCCESS); 2192 } 2193 2194 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2195 { 2196 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2197 2198 PetscFunctionBegin; 2199 a->idiagvalid = PETSC_FALSE; 2200 PetscFunctionReturn(PETSC_SUCCESS); 2201 } 2202 2203 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2204 { 2205 #if defined(PETSC_USE_COMPLEX) 2206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2207 PetscInt i; 2208 PetscScalar *val = a->val; 2209 2210 PetscFunctionBegin; 2211 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2212 #else 2213 PetscFunctionBegin; 2214 #endif 2215 PetscFunctionReturn(PETSC_SUCCESS); 2216 } 2217