1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter nz 48 (or the array nnz). By setting these parameters accurately, performance 49 during matrix assembly can be increased significantly. 50 51 Collective 52 53 Input Parameters: 54 + B - The `MATSEQSELL` matrix 55 . rlenmax - number of nonzeros per row (same for all rows) 56 - rlen - array containing the number of nonzeros in the various rows 57 (possibly different for each row) or `NULL` 58 59 Level: intermediate 60 61 Notes: 62 If `rlen` is given then `rlenmax` is ignored. 63 64 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 65 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 66 allocation. For large problems you MUST preallocate memory or you 67 will get TERRIBLE performance, see the users' manual chapter on matrices. 68 69 You can call `MatGetInfo()` to get information on how effective the preallocation was; 70 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 71 You can also run with the option `-info` and look for messages with the string 72 malloc in them to see if additional memory allocation was needed. 73 74 Developers Note: 75 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 76 entries or columns indices. 77 78 The maximum number of nonzeos in any row should be as accurate as possible. 79 If it is underestimated, you will get bad performance due to reallocation 80 (`MatSeqXSELLReallocateSELL()`). 81 82 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 83 @*/ 84 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 85 { 86 PetscFunctionBegin; 87 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 88 PetscValidType(B, 1); 89 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 94 { 95 Mat_SeqSELL *b; 96 PetscInt i, j, totalslices; 97 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 98 99 PetscFunctionBegin; 100 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 101 if (maxallocrow == MAT_SKIP_ALLOCATION) { 102 skipallocation = PETSC_TRUE; 103 maxallocrow = 0; 104 } 105 106 PetscCall(PetscLayoutSetUp(B->rmap)); 107 PetscCall(PetscLayoutSetUp(B->cmap)); 108 109 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 110 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 111 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 112 if (rlen) { 113 for (i = 0; i < B->rmap->n; i++) { 114 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 115 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 116 } 117 } 118 119 B->preallocated = PETSC_TRUE; 120 121 b = (Mat_SeqSELL *)B->data; 122 123 totalslices = PetscCeilInt(B->rmap->n, 8); 124 b->totalslices = totalslices; 125 if (!skipallocation) { 126 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n)); 127 128 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 129 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 130 } 131 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 132 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 133 else if (maxallocrow < 0) maxallocrow = 1; 134 for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow; 135 } else { 136 maxallocrow = 0; 137 b->sliidx[0] = 0; 138 for (i = 1; i < totalslices; i++) { 139 b->sliidx[i] = 0; 140 for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); 141 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 142 PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i])); 143 } 144 /* last slice */ 145 b->sliidx[totalslices] = 0; 146 for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 147 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 148 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices]; 149 } 150 151 /* allocate space for val, colidx, rlen */ 152 /* FIXME: should B's old memory be unlogged? */ 153 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 154 /* FIXME: assuming an element of the bit array takes 8 bits */ 155 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 156 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 157 PetscCall(PetscCalloc1(8 * totalslices, &b->rlen)); 158 159 b->singlemalloc = PETSC_TRUE; 160 b->free_val = PETSC_TRUE; 161 b->free_colidx = PETSC_TRUE; 162 } else { 163 b->free_val = PETSC_FALSE; 164 b->free_colidx = PETSC_FALSE; 165 } 166 167 b->nz = 0; 168 b->maxallocrow = maxallocrow; 169 b->rlenmax = maxallocrow; 170 b->maxallocmat = b->sliidx[totalslices]; 171 B->info.nz_unneeded = (double)b->maxallocmat; 172 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 173 PetscFunctionReturn(0); 174 } 175 176 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 177 { 178 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 179 PetscInt shift; 180 181 PetscFunctionBegin; 182 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 183 if (nz) *nz = a->rlen[row]; 184 shift = a->sliidx[row >> 3] + (row & 0x07); 185 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 186 if (idx) { 187 PetscInt j; 188 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j]; 189 *idx = a->getrowcols; 190 } 191 if (v) { 192 PetscInt j; 193 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j]; 194 *v = a->getrowvals; 195 } 196 PetscFunctionReturn(0); 197 } 198 199 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 200 { 201 PetscFunctionBegin; 202 PetscFunctionReturn(0); 203 } 204 205 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 206 { 207 Mat B; 208 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 209 PetscInt i; 210 211 PetscFunctionBegin; 212 if (reuse == MAT_REUSE_MATRIX) { 213 B = *newmat; 214 PetscCall(MatZeroEntries(B)); 215 } else { 216 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 217 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 218 PetscCall(MatSetType(B, MATSEQAIJ)); 219 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 220 } 221 222 for (i = 0; i < A->rmap->n; i++) { 223 PetscInt nz = 0, *cols = NULL; 224 PetscScalar *vals = NULL; 225 226 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 227 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 228 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 229 } 230 231 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 232 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 233 B->rmap->bs = A->rmap->bs; 234 235 if (reuse == MAT_INPLACE_MATRIX) { 236 PetscCall(MatHeaderReplace(A, &B)); 237 } else { 238 *newmat = B; 239 } 240 PetscFunctionReturn(0); 241 } 242 243 #include <../src/mat/impls/aij/seq/aij.h> 244 245 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 246 { 247 Mat B; 248 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 249 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 250 const PetscInt *cols; 251 const PetscScalar *vals; 252 253 PetscFunctionBegin; 254 255 if (reuse == MAT_REUSE_MATRIX) { 256 B = *newmat; 257 } else { 258 if (PetscDefined(USE_DEBUG) || !a->ilen) { 259 PetscCall(PetscMalloc1(m, &rowlengths)); 260 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 261 } 262 if (PetscDefined(USE_DEBUG) && a->ilen) { 263 PetscBool eq; 264 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 265 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 266 PetscCall(PetscFree(rowlengths)); 267 rowlengths = a->ilen; 268 } else if (a->ilen) rowlengths = a->ilen; 269 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 270 PetscCall(MatSetSizes(B, m, n, m, n)); 271 PetscCall(MatSetType(B, MATSEQSELL)); 272 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 273 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 274 } 275 276 for (row = 0; row < m; row++) { 277 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 278 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 279 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 280 } 281 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 282 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 283 B->rmap->bs = A->rmap->bs; 284 285 if (reuse == MAT_INPLACE_MATRIX) { 286 PetscCall(MatHeaderReplace(A, &B)); 287 } else { 288 *newmat = B; 289 } 290 PetscFunctionReturn(0); 291 } 292 293 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 294 { 295 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 296 PetscScalar *y; 297 const PetscScalar *x; 298 const MatScalar *aval = a->val; 299 PetscInt totalslices = a->totalslices; 300 const PetscInt *acolidx = a->colidx; 301 PetscInt i, j; 302 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 303 __m512d vec_x, vec_y, vec_vals; 304 __m256i vec_idx; 305 __mmask8 mask; 306 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 307 __m256i vec_idx2, vec_idx3, vec_idx4; 308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 309 __m128i vec_idx; 310 __m256d vec_x, vec_y, vec_y2, vec_vals; 311 MatScalar yval; 312 PetscInt r, rows_left, row, nnz_in_row; 313 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 314 __m128d vec_x_tmp; 315 __m256d vec_x, vec_y, vec_y2, vec_vals; 316 MatScalar yval; 317 PetscInt r, rows_left, row, nnz_in_row; 318 #else 319 PetscScalar sum[8]; 320 #endif 321 322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 323 #pragma disjoint(*x, *y, *aval) 324 #endif 325 326 PetscFunctionBegin; 327 PetscCall(VecGetArrayRead(xx, &x)); 328 PetscCall(VecGetArray(yy, &y)); 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 for (i = 0; i < totalslices; i++) { /* loop over slices */ 331 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 332 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 333 334 vec_y = _mm512_setzero_pd(); 335 vec_y2 = _mm512_setzero_pd(); 336 vec_y3 = _mm512_setzero_pd(); 337 vec_y4 = _mm512_setzero_pd(); 338 339 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 340 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 341 case 3: 342 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 343 acolidx += 8; 344 aval += 8; 345 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 346 acolidx += 8; 347 aval += 8; 348 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 349 acolidx += 8; 350 aval += 8; 351 j += 3; 352 break; 353 case 2: 354 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 355 acolidx += 8; 356 aval += 8; 357 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 358 acolidx += 8; 359 aval += 8; 360 j += 2; 361 break; 362 case 1: 363 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 364 acolidx += 8; 365 aval += 8; 366 j += 1; 367 break; 368 } 369 #pragma novector 370 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 381 acolidx += 8; 382 aval += 8; 383 } 384 385 vec_y = _mm512_add_pd(vec_y, vec_y2); 386 vec_y = _mm512_add_pd(vec_y, vec_y3); 387 vec_y = _mm512_add_pd(vec_y, vec_y4); 388 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 389 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 390 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 391 } else { 392 _mm512_storeu_pd(&y[8 * i], vec_y); 393 } 394 } 395 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 396 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 397 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 398 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 399 400 /* last slice may have padding rows. Don't use vectorization. */ 401 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 402 rows_left = A->rmap->n - 8 * i; 403 for (r = 0; r < rows_left; ++r) { 404 yval = (MatScalar)0; 405 row = 8 * i + r; 406 nnz_in_row = a->rlen[row]; 407 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 408 y[row] = yval; 409 } 410 break; 411 } 412 413 vec_y = _mm256_setzero_pd(); 414 vec_y2 = _mm256_setzero_pd(); 415 416 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 417 #pragma novector 418 #pragma unroll(2) 419 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 420 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 421 aval += 4; 422 acolidx += 4; 423 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 424 aval += 4; 425 acolidx += 4; 426 } 427 428 _mm256_storeu_pd(y + i * 8, vec_y); 429 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 430 } 431 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 432 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 433 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 434 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 435 436 vec_y = _mm256_setzero_pd(); 437 vec_y2 = _mm256_setzero_pd(); 438 439 /* last slice may have padding rows. Don't use vectorization. */ 440 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 441 rows_left = A->rmap->n - 8 * i; 442 for (r = 0; r < rows_left; ++r) { 443 yval = (MatScalar)0; 444 row = 8 * i + r; 445 nnz_in_row = a->rlen[row]; 446 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 447 y[row] = yval; 448 } 449 break; 450 } 451 452 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 453 #pragma novector 454 #pragma unroll(2) 455 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 456 vec_vals = _mm256_loadu_pd(aval); 457 vec_x_tmp = _mm_setzero_pd(); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 464 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 465 aval += 4; 466 467 vec_vals = _mm256_loadu_pd(aval); 468 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 469 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 470 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 471 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 472 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 473 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 474 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 475 aval += 4; 476 } 477 478 _mm256_storeu_pd(y + i * 8, vec_y); 479 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 480 } 481 #else 482 for (i = 0; i < totalslices; i++) { /* loop over slices */ 483 for (j = 0; j < 8; j++) sum[j] = 0.0; 484 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 485 sum[0] += aval[j] * x[acolidx[j]]; 486 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 487 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 488 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 489 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 490 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 491 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 492 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 493 } 494 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 495 for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j]; 496 } else { 497 for (j = 0; j < 8; j++) y[8 * i + j] = sum[j]; 498 } 499 } 500 #endif 501 502 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 503 PetscCall(VecRestoreArrayRead(xx, &x)); 504 PetscCall(VecRestoreArray(yy, &y)); 505 PetscFunctionReturn(0); 506 } 507 508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 509 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 510 { 511 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 512 PetscScalar *y, *z; 513 const PetscScalar *x; 514 const MatScalar *aval = a->val; 515 PetscInt totalslices = a->totalslices; 516 const PetscInt *acolidx = a->colidx; 517 PetscInt i, j; 518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 519 __m512d vec_x, vec_y, vec_vals; 520 __m256i vec_idx; 521 __mmask8 mask; 522 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 523 __m256i vec_idx2, vec_idx3, vec_idx4; 524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 525 __m128d vec_x_tmp; 526 __m256d vec_x, vec_y, vec_y2, vec_vals; 527 MatScalar yval; 528 PetscInt r, row, nnz_in_row; 529 #else 530 PetscScalar sum[8]; 531 #endif 532 533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 534 #pragma disjoint(*x, *y, *aval) 535 #endif 536 537 PetscFunctionBegin; 538 PetscCall(VecGetArrayRead(xx, &x)); 539 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 541 for (i = 0; i < totalslices; i++) { /* loop over slices */ 542 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 543 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 544 545 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 546 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 547 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 548 } else { 549 vec_y = _mm512_loadu_pd(&y[8 * i]); 550 } 551 vec_y2 = _mm512_setzero_pd(); 552 vec_y3 = _mm512_setzero_pd(); 553 vec_y4 = _mm512_setzero_pd(); 554 555 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 556 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 557 case 3: 558 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 559 acolidx += 8; 560 aval += 8; 561 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 562 acolidx += 8; 563 aval += 8; 564 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 565 acolidx += 8; 566 aval += 8; 567 j += 3; 568 break; 569 case 2: 570 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 571 acolidx += 8; 572 aval += 8; 573 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 574 acolidx += 8; 575 aval += 8; 576 j += 2; 577 break; 578 case 1: 579 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 580 acolidx += 8; 581 aval += 8; 582 j += 1; 583 break; 584 } 585 #pragma novector 586 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 587 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 588 acolidx += 8; 589 aval += 8; 590 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 597 acolidx += 8; 598 aval += 8; 599 } 600 601 vec_y = _mm512_add_pd(vec_y, vec_y2); 602 vec_y = _mm512_add_pd(vec_y, vec_y3); 603 vec_y = _mm512_add_pd(vec_y, vec_y4); 604 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 605 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 606 } else { 607 _mm512_storeu_pd(&z[8 * i], vec_y); 608 } 609 } 610 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 611 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 612 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 613 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 614 615 /* last slice may have padding rows. Don't use vectorization. */ 616 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 617 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 618 row = 8 * i + r; 619 yval = (MatScalar)0.0; 620 nnz_in_row = a->rlen[row]; 621 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 622 z[row] = y[row] + yval; 623 } 624 break; 625 } 626 627 vec_y = _mm256_loadu_pd(y + 8 * i); 628 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 629 630 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 631 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 632 vec_vals = _mm256_loadu_pd(aval); 633 vec_x_tmp = _mm_setzero_pd(); 634 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 635 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 636 vec_x = _mm256_setzero_pd(); 637 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 638 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 639 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 640 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 641 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 642 aval += 4; 643 644 vec_vals = _mm256_loadu_pd(aval); 645 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 646 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 647 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 648 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 649 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 650 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 651 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 652 aval += 4; 653 } 654 655 _mm256_storeu_pd(z + i * 8, vec_y); 656 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 657 } 658 #else 659 for (i = 0; i < totalslices; i++) { /* loop over slices */ 660 for (j = 0; j < 8; j++) sum[j] = 0.0; 661 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 662 sum[0] += aval[j] * x[acolidx[j]]; 663 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 664 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 665 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 666 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 667 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 668 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 669 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 670 } 671 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 672 for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 673 } else { 674 for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 675 } 676 } 677 #endif 678 679 PetscCall(PetscLogFlops(2.0 * a->nz)); 680 PetscCall(VecRestoreArrayRead(xx, &x)); 681 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 682 PetscFunctionReturn(0); 683 } 684 685 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 686 { 687 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 688 PetscScalar *y; 689 const PetscScalar *x; 690 const MatScalar *aval = a->val; 691 const PetscInt *acolidx = a->colidx; 692 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices; 693 694 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 695 #pragma disjoint(*x, *y, *aval) 696 #endif 697 698 PetscFunctionBegin; 699 if (A->symmetric == PETSC_BOOL3_TRUE) { 700 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 701 PetscFunctionReturn(0); 702 } 703 if (zz != yy) PetscCall(VecCopy(zz, yy)); 704 PetscCall(VecGetArrayRead(xx, &x)); 705 PetscCall(VecGetArray(yy, &y)); 706 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 707 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 708 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 709 row = 8 * i + r; 710 nnz_in_row = a->rlen[row]; 711 for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row]; 712 } 713 break; 714 } 715 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 716 y[acolidx[j]] += aval[j] * x[8 * i]; 717 y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1]; 718 y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2]; 719 y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3]; 720 y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4]; 721 y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5]; 722 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 723 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 724 } 725 } 726 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 727 PetscCall(VecRestoreArrayRead(xx, &x)); 728 PetscCall(VecRestoreArray(yy, &y)); 729 PetscFunctionReturn(0); 730 } 731 732 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 733 { 734 PetscFunctionBegin; 735 if (A->symmetric == PETSC_BOOL3_TRUE) { 736 PetscCall(MatMult_SeqSELL(A, xx, yy)); 737 } else { 738 PetscCall(VecSet(yy, 0.0)); 739 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 740 } 741 PetscFunctionReturn(0); 742 } 743 744 /* 745 Checks for missing diagonals 746 */ 747 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 748 { 749 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 750 PetscInt *diag, i; 751 752 PetscFunctionBegin; 753 *missing = PETSC_FALSE; 754 if (A->rmap->n > 0 && !(a->colidx)) { 755 *missing = PETSC_TRUE; 756 if (d) *d = 0; 757 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 758 } else { 759 diag = a->diag; 760 for (i = 0; i < A->rmap->n; i++) { 761 if (diag[i] == -1) { 762 *missing = PETSC_TRUE; 763 if (d) *d = i; 764 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 765 break; 766 } 767 } 768 } 769 PetscFunctionReturn(0); 770 } 771 772 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 773 { 774 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 775 PetscInt i, j, m = A->rmap->n, shift; 776 777 PetscFunctionBegin; 778 if (!a->diag) { 779 PetscCall(PetscMalloc1(m, &a->diag)); 780 a->free_diag = PETSC_TRUE; 781 } 782 for (i = 0; i < m; i++) { /* loop over rows */ 783 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 784 a->diag[i] = -1; 785 for (j = 0; j < a->rlen[i]; j++) { 786 if (a->colidx[shift + j * 8] == i) { 787 a->diag[i] = shift + j * 8; 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(0); 793 } 794 795 /* 796 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 797 */ 798 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 799 { 800 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 801 PetscInt i, *diag, m = A->rmap->n; 802 MatScalar *val = a->val; 803 PetscScalar *idiag, *mdiag; 804 805 PetscFunctionBegin; 806 if (a->idiagvalid) PetscFunctionReturn(0); 807 PetscCall(MatMarkDiagonal_SeqSELL(A)); 808 diag = a->diag; 809 if (!a->idiag) { 810 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 811 val = a->val; 812 } 813 mdiag = a->mdiag; 814 idiag = a->idiag; 815 816 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 817 for (i = 0; i < m; i++) { 818 mdiag[i] = val[diag[i]]; 819 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 820 if (PetscRealPart(fshift)) { 821 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 822 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 823 A->factorerror_zeropivot_value = 0.0; 824 A->factorerror_zeropivot_row = i; 825 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 826 } 827 idiag[i] = 1.0 / val[diag[i]]; 828 } 829 PetscCall(PetscLogFlops(m)); 830 } else { 831 for (i = 0; i < m; i++) { 832 mdiag[i] = val[diag[i]]; 833 idiag[i] = omega / (fshift + val[diag[i]]); 834 } 835 PetscCall(PetscLogFlops(2.0 * m)); 836 } 837 a->idiagvalid = PETSC_TRUE; 838 PetscFunctionReturn(0); 839 } 840 841 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 842 { 843 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 844 845 PetscFunctionBegin; 846 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 847 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 848 PetscFunctionReturn(0); 849 } 850 851 PetscErrorCode MatDestroy_SeqSELL(Mat A) 852 { 853 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 854 855 PetscFunctionBegin; 856 #if defined(PETSC_USE_LOG) 857 PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz); 858 #endif 859 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 860 PetscCall(ISDestroy(&a->row)); 861 PetscCall(ISDestroy(&a->col)); 862 PetscCall(PetscFree(a->diag)); 863 PetscCall(PetscFree(a->rlen)); 864 PetscCall(PetscFree(a->sliidx)); 865 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 866 PetscCall(PetscFree(a->solve_work)); 867 PetscCall(ISDestroy(&a->icol)); 868 PetscCall(PetscFree(a->saved_values)); 869 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 870 871 PetscCall(PetscFree(A->data)); 872 873 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 874 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 875 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 876 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 877 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 878 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 879 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 880 PetscFunctionReturn(0); 881 } 882 883 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 884 { 885 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 886 887 PetscFunctionBegin; 888 switch (op) { 889 case MAT_ROW_ORIENTED: 890 a->roworiented = flg; 891 break; 892 case MAT_KEEP_NONZERO_PATTERN: 893 a->keepnonzeropattern = flg; 894 break; 895 case MAT_NEW_NONZERO_LOCATIONS: 896 a->nonew = (flg ? 0 : 1); 897 break; 898 case MAT_NEW_NONZERO_LOCATION_ERR: 899 a->nonew = (flg ? -1 : 0); 900 break; 901 case MAT_NEW_NONZERO_ALLOCATION_ERR: 902 a->nonew = (flg ? -2 : 0); 903 break; 904 case MAT_UNUSED_NONZERO_LOCATION_ERR: 905 a->nounused = (flg ? -1 : 0); 906 break; 907 case MAT_FORCE_DIAGONAL_ENTRIES: 908 case MAT_IGNORE_OFF_PROC_ENTRIES: 909 case MAT_USE_HASH_TABLE: 910 case MAT_SORTED_FULL: 911 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 912 break; 913 case MAT_SPD: 914 case MAT_SYMMETRIC: 915 case MAT_STRUCTURALLY_SYMMETRIC: 916 case MAT_HERMITIAN: 917 case MAT_SYMMETRY_ETERNAL: 918 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 919 case MAT_SPD_ETERNAL: 920 /* These options are handled directly by MatSetOption() */ 921 break; 922 default: 923 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 924 } 925 PetscFunctionReturn(0); 926 } 927 928 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 929 { 930 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 931 PetscInt i, j, n, shift; 932 PetscScalar *x, zero = 0.0; 933 934 PetscFunctionBegin; 935 PetscCall(VecGetLocalSize(v, &n)); 936 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 937 938 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 939 PetscInt *diag = a->diag; 940 PetscCall(VecGetArray(v, &x)); 941 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 942 PetscCall(VecRestoreArray(v, &x)); 943 PetscFunctionReturn(0); 944 } 945 946 PetscCall(VecSet(v, zero)); 947 PetscCall(VecGetArray(v, &x)); 948 for (i = 0; i < n; i++) { /* loop over rows */ 949 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 950 x[i] = 0; 951 for (j = 0; j < a->rlen[i]; j++) { 952 if (a->colidx[shift + j * 8] == i) { 953 x[i] = a->val[shift + j * 8]; 954 break; 955 } 956 } 957 } 958 PetscCall(VecRestoreArray(v, &x)); 959 PetscFunctionReturn(0); 960 } 961 962 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 963 { 964 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 965 const PetscScalar *l, *r; 966 PetscInt i, j, m, n, row; 967 968 PetscFunctionBegin; 969 if (ll) { 970 /* The local size is used so that VecMPI can be passed to this routine 971 by MatDiagonalScale_MPISELL */ 972 PetscCall(VecGetLocalSize(ll, &m)); 973 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 974 PetscCall(VecGetArrayRead(ll, &l)); 975 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 976 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 977 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 978 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row]; 979 } 980 } else { 981 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row]; 982 } 983 } 984 PetscCall(VecRestoreArrayRead(ll, &l)); 985 PetscCall(PetscLogFlops(a->nz)); 986 } 987 if (rr) { 988 PetscCall(VecGetLocalSize(rr, &n)); 989 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 990 PetscCall(VecGetArrayRead(rr, &r)); 991 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 992 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 993 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 994 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 995 } 996 } else { 997 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 998 } 999 } 1000 PetscCall(VecRestoreArrayRead(rr, &r)); 1001 PetscCall(PetscLogFlops(a->nz)); 1002 } 1003 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1004 PetscFunctionReturn(0); 1005 } 1006 1007 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1008 { 1009 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1010 PetscInt *cp, i, k, low, high, t, row, col, l; 1011 PetscInt shift; 1012 MatScalar *vp; 1013 1014 PetscFunctionBegin; 1015 for (k = 0; k < m; k++) { /* loop over requested rows */ 1016 row = im[k]; 1017 if (row < 0) continue; 1018 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1019 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1020 cp = a->colidx + shift; /* pointer to the row */ 1021 vp = a->val + shift; /* pointer to the row */ 1022 for (l = 0; l < n; l++) { /* loop over requested columns */ 1023 col = in[l]; 1024 if (col < 0) continue; 1025 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1026 high = a->rlen[row]; 1027 low = 0; /* assume unsorted */ 1028 while (high - low > 5) { 1029 t = (low + high) / 2; 1030 if (*(cp + t * 8) > col) high = t; 1031 else low = t; 1032 } 1033 for (i = low; i < high; i++) { 1034 if (*(cp + 8 * i) > col) break; 1035 if (*(cp + 8 * i) == col) { 1036 *v++ = *(vp + 8 * i); 1037 goto finished; 1038 } 1039 } 1040 *v++ = 0.0; 1041 finished:; 1042 } 1043 } 1044 PetscFunctionReturn(0); 1045 } 1046 1047 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1048 { 1049 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1050 PetscInt i, j, m = A->rmap->n, shift; 1051 const char *name; 1052 PetscViewerFormat format; 1053 1054 PetscFunctionBegin; 1055 PetscCall(PetscViewerGetFormat(viewer, &format)); 1056 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1057 PetscInt nofinalvalue = 0; 1058 /* 1059 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1060 nofinalvalue = 1; 1061 } 1062 */ 1063 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1064 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1065 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1066 #if defined(PETSC_USE_COMPLEX) 1067 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1068 #else 1069 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1070 #endif 1071 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1072 1073 for (i = 0; i < m; i++) { 1074 shift = a->sliidx[i >> 3] + (i & 0x07); 1075 for (j = 0; j < a->rlen[i]; j++) { 1076 #if defined(PETSC_USE_COMPLEX) 1077 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1078 #else 1079 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1080 #endif 1081 } 1082 } 1083 /* 1084 if (nofinalvalue) { 1085 #if defined(PETSC_USE_COMPLEX) 1086 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1087 #else 1088 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1089 #endif 1090 } 1091 */ 1092 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1093 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1094 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1095 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1096 PetscFunctionReturn(0); 1097 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1098 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1099 for (i = 0; i < m; i++) { 1100 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1101 shift = a->sliidx[i >> 3] + (i & 0x07); 1102 for (j = 0; j < a->rlen[i]; j++) { 1103 #if defined(PETSC_USE_COMPLEX) 1104 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1105 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1106 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1107 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1108 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1109 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1110 } 1111 #else 1112 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1113 #endif 1114 } 1115 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1116 } 1117 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1118 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1119 PetscInt cnt = 0, jcnt; 1120 PetscScalar value; 1121 #if defined(PETSC_USE_COMPLEX) 1122 PetscBool realonly = PETSC_TRUE; 1123 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1124 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1125 realonly = PETSC_FALSE; 1126 break; 1127 } 1128 } 1129 #endif 1130 1131 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1132 for (i = 0; i < m; i++) { 1133 jcnt = 0; 1134 shift = a->sliidx[i >> 3] + (i & 0x07); 1135 for (j = 0; j < A->cmap->n; j++) { 1136 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1137 value = a->val[cnt++]; 1138 jcnt++; 1139 } else { 1140 value = 0.0; 1141 } 1142 #if defined(PETSC_USE_COMPLEX) 1143 if (realonly) { 1144 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1145 } else { 1146 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1147 } 1148 #else 1149 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1150 #endif 1151 } 1152 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1153 } 1154 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1155 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1156 PetscInt fshift = 1; 1157 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1158 #if defined(PETSC_USE_COMPLEX) 1159 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1160 #else 1161 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1162 #endif 1163 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1164 for (i = 0; i < m; i++) { 1165 shift = a->sliidx[i >> 3] + (i & 0x07); 1166 for (j = 0; j < a->rlen[i]; j++) { 1167 #if defined(PETSC_USE_COMPLEX) 1168 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1169 #else 1170 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1171 #endif 1172 } 1173 } 1174 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1175 } else if (format == PETSC_VIEWER_NATIVE) { 1176 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1177 PetscInt row; 1178 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1179 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1180 #if defined(PETSC_USE_COMPLEX) 1181 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1182 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1183 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1184 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1185 } else { 1186 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1187 } 1188 #else 1189 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1190 #endif 1191 } 1192 } 1193 } else { 1194 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1195 if (A->factortype) { 1196 for (i = 0; i < m; i++) { 1197 shift = a->sliidx[i >> 3] + (i & 0x07); 1198 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1199 /* L part */ 1200 for (j = shift; j < a->diag[i]; j += 8) { 1201 #if defined(PETSC_USE_COMPLEX) 1202 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1203 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1204 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1205 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1206 } else { 1207 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1208 } 1209 #else 1210 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1211 #endif 1212 } 1213 /* diagonal */ 1214 j = a->diag[i]; 1215 #if defined(PETSC_USE_COMPLEX) 1216 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1217 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1218 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1219 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1220 } else { 1221 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1222 } 1223 #else 1224 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1225 #endif 1226 1227 /* U part */ 1228 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1229 #if defined(PETSC_USE_COMPLEX) 1230 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1231 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1232 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1233 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1234 } else { 1235 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1236 } 1237 #else 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1239 #endif 1240 } 1241 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1242 } 1243 } else { 1244 for (i = 0; i < m; i++) { 1245 shift = a->sliidx[i >> 3] + (i & 0x07); 1246 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1247 for (j = 0; j < a->rlen[i]; j++) { 1248 #if defined(PETSC_USE_COMPLEX) 1249 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1251 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1253 } else { 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1255 } 1256 #else 1257 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1258 #endif 1259 } 1260 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1261 } 1262 } 1263 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1264 } 1265 PetscCall(PetscViewerFlush(viewer)); 1266 PetscFunctionReturn(0); 1267 } 1268 1269 #include <petscdraw.h> 1270 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1271 { 1272 Mat A = (Mat)Aa; 1273 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1274 PetscInt i, j, m = A->rmap->n, shift; 1275 int color; 1276 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1277 PetscViewer viewer; 1278 PetscViewerFormat format; 1279 1280 PetscFunctionBegin; 1281 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1282 PetscCall(PetscViewerGetFormat(viewer, &format)); 1283 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1284 1285 /* loop over matrix elements drawing boxes */ 1286 1287 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1288 PetscDrawCollectiveBegin(draw); 1289 /* Blue for negative, Cyan for zero and Red for positive */ 1290 color = PETSC_DRAW_BLUE; 1291 for (i = 0; i < m; i++) { 1292 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1293 y_l = m - i - 1.0; 1294 y_r = y_l + 1.0; 1295 for (j = 0; j < a->rlen[i]; j++) { 1296 x_l = a->colidx[shift + j * 8]; 1297 x_r = x_l + 1.0; 1298 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue; 1299 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1300 } 1301 } 1302 color = PETSC_DRAW_CYAN; 1303 for (i = 0; i < m; i++) { 1304 shift = a->sliidx[i >> 3] + (i & 0x07); 1305 y_l = m - i - 1.0; 1306 y_r = y_l + 1.0; 1307 for (j = 0; j < a->rlen[i]; j++) { 1308 x_l = a->colidx[shift + j * 8]; 1309 x_r = x_l + 1.0; 1310 if (a->val[shift + 8 * j] != 0.) continue; 1311 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1312 } 1313 } 1314 color = PETSC_DRAW_RED; 1315 for (i = 0; i < m; i++) { 1316 shift = a->sliidx[i >> 3] + (i & 0x07); 1317 y_l = m - i - 1.0; 1318 y_r = y_l + 1.0; 1319 for (j = 0; j < a->rlen[i]; j++) { 1320 x_l = a->colidx[shift + j * 8]; 1321 x_r = x_l + 1.0; 1322 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1323 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1324 } 1325 } 1326 PetscDrawCollectiveEnd(draw); 1327 } else { 1328 /* use contour shading to indicate magnitude of values */ 1329 /* first determine max of all nonzero values */ 1330 PetscReal minv = 0.0, maxv = 0.0; 1331 PetscInt count = 0; 1332 PetscDraw popup; 1333 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1334 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1335 } 1336 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1337 PetscCall(PetscDrawGetPopup(draw, &popup)); 1338 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1339 1340 PetscDrawCollectiveBegin(draw); 1341 for (i = 0; i < m; i++) { 1342 shift = a->sliidx[i >> 3] + (i & 0x07); 1343 y_l = m - i - 1.0; 1344 y_r = y_l + 1.0; 1345 for (j = 0; j < a->rlen[i]; j++) { 1346 x_l = a->colidx[shift + j * 8]; 1347 x_r = x_l + 1.0; 1348 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1349 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1350 count++; 1351 } 1352 } 1353 PetscDrawCollectiveEnd(draw); 1354 } 1355 PetscFunctionReturn(0); 1356 } 1357 1358 #include <petscdraw.h> 1359 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1360 { 1361 PetscDraw draw; 1362 PetscReal xr, yr, xl, yl, h, w; 1363 PetscBool isnull; 1364 1365 PetscFunctionBegin; 1366 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1367 PetscCall(PetscDrawIsNull(draw, &isnull)); 1368 if (isnull) PetscFunctionReturn(0); 1369 1370 xr = A->cmap->n; 1371 yr = A->rmap->n; 1372 h = yr / 10.0; 1373 w = xr / 10.0; 1374 xr += w; 1375 yr += h; 1376 xl = -w; 1377 yl = -h; 1378 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1379 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1380 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1381 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1382 PetscCall(PetscDrawSave(draw)); 1383 PetscFunctionReturn(0); 1384 } 1385 1386 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1387 { 1388 PetscBool iascii, isbinary, isdraw; 1389 1390 PetscFunctionBegin; 1391 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1392 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1393 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1394 if (iascii) { 1395 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1396 } else if (isbinary) { 1397 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1398 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1399 PetscFunctionReturn(0); 1400 } 1401 1402 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1403 { 1404 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1405 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1406 MatScalar *vp; 1407 1408 PetscFunctionBegin; 1409 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1410 /* To do: compress out the unused elements */ 1411 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1412 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1413 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1414 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1415 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1416 for (i = 0; i < a->totalslices; ++i) { 1417 shift = a->sliidx[i]; /* starting index of the slice */ 1418 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1419 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1420 for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */ 1421 row = 8 * i + row_in_slice; 1422 nrow = a->rlen[row]; /* number of nonzeros in row */ 1423 /* 1424 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1425 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1426 */ 1427 lastcol = 0; 1428 if (nrow > 0) { /* nonempty row */ 1429 lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1430 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1431 for (j = 1; j < 8; j++) { 1432 if (a->rlen[8 * i + j]) { 1433 lastcol = cp[j]; 1434 break; 1435 } 1436 } 1437 } else { 1438 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1439 } 1440 1441 for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) { 1442 cp[8 * k + row_in_slice] = lastcol; 1443 vp[8 * k + row_in_slice] = (MatScalar)0; 1444 } 1445 } 1446 } 1447 1448 A->info.mallocs += a->reallocs; 1449 a->reallocs = 0; 1450 1451 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1452 PetscFunctionReturn(0); 1453 } 1454 1455 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1456 { 1457 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1458 1459 PetscFunctionBegin; 1460 info->block_size = 1.0; 1461 info->nz_allocated = a->maxallocmat; 1462 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1463 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1464 info->assemblies = A->num_ass; 1465 info->mallocs = A->info.mallocs; 1466 info->memory = 0; /* REVIEW ME */ 1467 if (A->factortype) { 1468 info->fill_ratio_given = A->info.fill_ratio_given; 1469 info->fill_ratio_needed = A->info.fill_ratio_needed; 1470 info->factor_mallocs = A->info.factor_mallocs; 1471 } else { 1472 info->fill_ratio_given = 0; 1473 info->fill_ratio_needed = 0; 1474 info->factor_mallocs = 0; 1475 } 1476 PetscFunctionReturn(0); 1477 } 1478 1479 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1480 { 1481 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1482 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1483 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1484 MatScalar *vp, value; 1485 1486 PetscFunctionBegin; 1487 for (k = 0; k < m; k++) { /* loop over added rows */ 1488 row = im[k]; 1489 if (row < 0) continue; 1490 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1491 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1492 cp = a->colidx + shift; /* pointer to the row */ 1493 vp = a->val + shift; /* pointer to the row */ 1494 nrow = a->rlen[row]; 1495 low = 0; 1496 high = nrow; 1497 1498 for (l = 0; l < n; l++) { /* loop over added columns */ 1499 col = in[l]; 1500 if (col < 0) continue; 1501 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1502 if (a->roworiented) { 1503 value = v[l + k * n]; 1504 } else { 1505 value = v[k + l * m]; 1506 } 1507 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1508 1509 /* search in this row for the specified column, i indicates the column to be set */ 1510 if (col <= lastcol) low = 0; 1511 else high = nrow; 1512 lastcol = col; 1513 while (high - low > 5) { 1514 t = (low + high) / 2; 1515 if (*(cp + t * 8) > col) high = t; 1516 else low = t; 1517 } 1518 for (i = low; i < high; i++) { 1519 if (*(cp + i * 8) > col) break; 1520 if (*(cp + i * 8) == col) { 1521 if (is == ADD_VALUES) *(vp + i * 8) += value; 1522 else *(vp + i * 8) = value; 1523 low = i + 1; 1524 goto noinsert; 1525 } 1526 } 1527 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1528 if (nonew == 1) goto noinsert; 1529 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1530 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1531 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar); 1532 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1533 for (ii = nrow - 1; ii >= i; ii--) { 1534 *(cp + (ii + 1) * 8) = *(cp + ii * 8); 1535 *(vp + (ii + 1) * 8) = *(vp + ii * 8); 1536 } 1537 a->rlen[row]++; 1538 *(cp + i * 8) = col; 1539 *(vp + i * 8) = value; 1540 a->nz++; 1541 A->nonzerostate++; 1542 low = i + 1; 1543 high++; 1544 nrow++; 1545 noinsert:; 1546 } 1547 a->rlen[row] = nrow; 1548 } 1549 PetscFunctionReturn(0); 1550 } 1551 1552 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1553 { 1554 PetscFunctionBegin; 1555 /* If the two matrices have the same copy implementation, use fast copy. */ 1556 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1557 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1558 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1559 1560 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1561 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1562 } else { 1563 PetscCall(MatCopy_Basic(A, B, str)); 1564 } 1565 PetscFunctionReturn(0); 1566 } 1567 1568 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1569 { 1570 PetscFunctionBegin; 1571 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1572 PetscFunctionReturn(0); 1573 } 1574 1575 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1576 { 1577 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1578 1579 PetscFunctionBegin; 1580 *array = a->val; 1581 PetscFunctionReturn(0); 1582 } 1583 1584 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1585 { 1586 PetscFunctionBegin; 1587 PetscFunctionReturn(0); 1588 } 1589 1590 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1591 { 1592 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1593 PetscInt i; 1594 MatScalar *aval = a->val; 1595 1596 PetscFunctionBegin; 1597 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1598 PetscFunctionReturn(0); 1599 } 1600 1601 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1602 { 1603 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1604 PetscInt i; 1605 MatScalar *aval = a->val; 1606 1607 PetscFunctionBegin; 1608 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1609 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1610 PetscFunctionReturn(0); 1611 } 1612 1613 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1614 { 1615 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1616 MatScalar *aval = a->val; 1617 PetscScalar oalpha = alpha; 1618 PetscBLASInt one = 1, size; 1619 1620 PetscFunctionBegin; 1621 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1622 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1623 PetscCall(PetscLogFlops(a->nz)); 1624 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1625 PetscFunctionReturn(0); 1626 } 1627 1628 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1629 { 1630 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1631 1632 PetscFunctionBegin; 1633 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1634 PetscCall(MatShift_Basic(Y, a)); 1635 PetscFunctionReturn(0); 1636 } 1637 1638 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1639 { 1640 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1641 PetscScalar *x, sum, *t; 1642 const MatScalar *idiag = NULL, *mdiag; 1643 const PetscScalar *b, *xb; 1644 PetscInt n, m = A->rmap->n, i, j, shift; 1645 const PetscInt *diag; 1646 1647 PetscFunctionBegin; 1648 its = its * lits; 1649 1650 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1651 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1652 a->fshift = fshift; 1653 a->omega = omega; 1654 1655 diag = a->diag; 1656 t = a->ssor_work; 1657 idiag = a->idiag; 1658 mdiag = a->mdiag; 1659 1660 PetscCall(VecGetArray(xx, &x)); 1661 PetscCall(VecGetArrayRead(bb, &b)); 1662 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1663 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1664 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1665 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1666 1667 if (flag & SOR_ZERO_INITIAL_GUESS) { 1668 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1669 for (i = 0; i < m; i++) { 1670 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1671 sum = b[i]; 1672 n = (diag[i] - shift) / 8; 1673 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1674 t[i] = sum; 1675 x[i] = sum * idiag[i]; 1676 } 1677 xb = t; 1678 PetscCall(PetscLogFlops(a->nz)); 1679 } else xb = b; 1680 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1681 for (i = m - 1; i >= 0; i--) { 1682 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1683 sum = xb[i]; 1684 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1685 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1686 if (xb == b) { 1687 x[i] = sum * idiag[i]; 1688 } else { 1689 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1690 } 1691 } 1692 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1693 } 1694 its--; 1695 } 1696 while (its--) { 1697 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1698 for (i = 0; i < m; i++) { 1699 /* lower */ 1700 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1701 sum = b[i]; 1702 n = (diag[i] - shift) / 8; 1703 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1704 t[i] = sum; /* save application of the lower-triangular part */ 1705 /* upper */ 1706 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1707 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1708 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1709 } 1710 xb = t; 1711 PetscCall(PetscLogFlops(2.0 * a->nz)); 1712 } else xb = b; 1713 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1714 for (i = m - 1; i >= 0; i--) { 1715 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1716 sum = xb[i]; 1717 if (xb == b) { 1718 /* whole matrix (no checkpointing available) */ 1719 n = a->rlen[i]; 1720 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1721 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1722 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1723 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1724 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1725 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1726 } 1727 } 1728 if (xb == b) { 1729 PetscCall(PetscLogFlops(2.0 * a->nz)); 1730 } else { 1731 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1732 } 1733 } 1734 } 1735 PetscCall(VecRestoreArray(xx, &x)); 1736 PetscCall(VecRestoreArrayRead(bb, &b)); 1737 PetscFunctionReturn(0); 1738 } 1739 1740 /* -------------------------------------------------------------------*/ 1741 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1742 MatGetRow_SeqSELL, 1743 MatRestoreRow_SeqSELL, 1744 MatMult_SeqSELL, 1745 /* 4*/ MatMultAdd_SeqSELL, 1746 MatMultTranspose_SeqSELL, 1747 MatMultTransposeAdd_SeqSELL, 1748 NULL, 1749 NULL, 1750 NULL, 1751 /* 10*/ NULL, 1752 NULL, 1753 NULL, 1754 MatSOR_SeqSELL, 1755 NULL, 1756 /* 15*/ MatGetInfo_SeqSELL, 1757 MatEqual_SeqSELL, 1758 MatGetDiagonal_SeqSELL, 1759 MatDiagonalScale_SeqSELL, 1760 NULL, 1761 /* 20*/ NULL, 1762 MatAssemblyEnd_SeqSELL, 1763 MatSetOption_SeqSELL, 1764 MatZeroEntries_SeqSELL, 1765 /* 24*/ NULL, 1766 NULL, 1767 NULL, 1768 NULL, 1769 NULL, 1770 /* 29*/ MatSetUp_SeqSELL, 1771 NULL, 1772 NULL, 1773 NULL, 1774 NULL, 1775 /* 34*/ MatDuplicate_SeqSELL, 1776 NULL, 1777 NULL, 1778 NULL, 1779 NULL, 1780 /* 39*/ NULL, 1781 NULL, 1782 NULL, 1783 MatGetValues_SeqSELL, 1784 MatCopy_SeqSELL, 1785 /* 44*/ NULL, 1786 MatScale_SeqSELL, 1787 MatShift_SeqSELL, 1788 NULL, 1789 NULL, 1790 /* 49*/ NULL, 1791 NULL, 1792 NULL, 1793 NULL, 1794 NULL, 1795 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1796 NULL, 1797 NULL, 1798 NULL, 1799 NULL, 1800 /* 59*/ NULL, 1801 MatDestroy_SeqSELL, 1802 MatView_SeqSELL, 1803 NULL, 1804 NULL, 1805 /* 64*/ NULL, 1806 NULL, 1807 NULL, 1808 NULL, 1809 NULL, 1810 /* 69*/ NULL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 NULL, 1815 /* 74*/ NULL, 1816 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1817 NULL, 1818 NULL, 1819 NULL, 1820 /* 79*/ NULL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 NULL, 1825 /* 84*/ NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 /* 89*/ NULL, 1831 NULL, 1832 NULL, 1833 NULL, 1834 NULL, 1835 /* 94*/ NULL, 1836 NULL, 1837 NULL, 1838 NULL, 1839 NULL, 1840 /* 99*/ NULL, 1841 NULL, 1842 NULL, 1843 MatConjugate_SeqSELL, 1844 NULL, 1845 /*104*/ NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 /*109*/ NULL, 1851 NULL, 1852 NULL, 1853 NULL, 1854 MatMissingDiagonal_SeqSELL, 1855 /*114*/ NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 /*119*/ NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 NULL, 1865 /*124*/ NULL, 1866 NULL, 1867 NULL, 1868 NULL, 1869 NULL, 1870 /*129*/ NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 NULL, 1875 /*134*/ NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 NULL, 1880 /*139*/ NULL, 1881 NULL, 1882 NULL, 1883 MatFDColoringSetUp_SeqXAIJ, 1884 NULL, 1885 /*144*/ NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 NULL, 1891 /*150*/ NULL}; 1892 1893 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1894 { 1895 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1896 1897 PetscFunctionBegin; 1898 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1899 1900 /* allocate space for values if not already there */ 1901 if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); } 1902 1903 /* copy values over */ 1904 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1905 PetscFunctionReturn(0); 1906 } 1907 1908 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1909 { 1910 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1911 1912 PetscFunctionBegin; 1913 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1914 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1915 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1916 PetscFunctionReturn(0); 1917 } 1918 1919 /*@C 1920 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 1921 1922 Not Collective 1923 1924 Input Parameters: 1925 . mat - a `MATSEQSELL` matrix 1926 . array - pointer to the data 1927 1928 Level: intermediate 1929 1930 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1931 @*/ 1932 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 1933 { 1934 PetscFunctionBegin; 1935 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 1936 PetscFunctionReturn(0); 1937 } 1938 1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1940 { 1941 Mat_SeqSELL *b; 1942 PetscMPIInt size; 1943 1944 PetscFunctionBegin; 1945 PetscCall(PetscCitationsRegister(citation, &cited)); 1946 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 1947 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 1948 1949 PetscCall(PetscNew(&b)); 1950 1951 B->data = (void *)b; 1952 1953 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 1954 1955 b->row = NULL; 1956 b->col = NULL; 1957 b->icol = NULL; 1958 b->reallocs = 0; 1959 b->ignorezeroentries = PETSC_FALSE; 1960 b->roworiented = PETSC_TRUE; 1961 b->nonew = 0; 1962 b->diag = NULL; 1963 b->solve_work = NULL; 1964 B->spptr = NULL; 1965 b->saved_values = NULL; 1966 b->idiag = NULL; 1967 b->mdiag = NULL; 1968 b->ssor_work = NULL; 1969 b->omega = 1.0; 1970 b->fshift = 0.0; 1971 b->idiagvalid = PETSC_FALSE; 1972 b->keepnonzeropattern = PETSC_FALSE; 1973 1974 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 1975 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 1976 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 1977 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 1978 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 1979 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 1980 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 1981 PetscFunctionReturn(0); 1982 } 1983 1984 /* 1985 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1986 */ 1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 1988 { 1989 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 1990 PetscInt i, m = A->rmap->n; 1991 PetscInt totalslices = a->totalslices; 1992 1993 PetscFunctionBegin; 1994 C->factortype = A->factortype; 1995 c->row = NULL; 1996 c->col = NULL; 1997 c->icol = NULL; 1998 c->reallocs = 0; 1999 C->assembled = PETSC_TRUE; 2000 2001 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2002 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2003 2004 PetscCall(PetscMalloc1(8 * totalslices, &c->rlen)); 2005 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2006 2007 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2008 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2009 2010 /* allocate the matrix space */ 2011 if (mallocmatspace) { 2012 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2013 2014 c->singlemalloc = PETSC_TRUE; 2015 2016 if (m > 0) { 2017 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2018 if (cpvalues == MAT_COPY_VALUES) { 2019 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2020 } else { 2021 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2022 } 2023 } 2024 } 2025 2026 c->ignorezeroentries = a->ignorezeroentries; 2027 c->roworiented = a->roworiented; 2028 c->nonew = a->nonew; 2029 if (a->diag) { 2030 PetscCall(PetscMalloc1(m, &c->diag)); 2031 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2032 } else c->diag = NULL; 2033 2034 c->solve_work = NULL; 2035 c->saved_values = NULL; 2036 c->idiag = NULL; 2037 c->ssor_work = NULL; 2038 c->keepnonzeropattern = a->keepnonzeropattern; 2039 c->free_val = PETSC_TRUE; 2040 c->free_colidx = PETSC_TRUE; 2041 2042 c->maxallocmat = a->maxallocmat; 2043 c->maxallocrow = a->maxallocrow; 2044 c->rlenmax = a->rlenmax; 2045 c->nz = a->nz; 2046 C->preallocated = PETSC_TRUE; 2047 2048 c->nonzerorowcnt = a->nonzerorowcnt; 2049 C->nonzerostate = A->nonzerostate; 2050 2051 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2052 PetscFunctionReturn(0); 2053 } 2054 2055 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2056 { 2057 PetscFunctionBegin; 2058 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2059 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2060 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2061 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2062 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2063 PetscFunctionReturn(0); 2064 } 2065 2066 /*MC 2067 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2068 based on the sliced Ellpack format 2069 2070 Options Database Keys: 2071 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2072 2073 Level: beginner 2074 2075 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2076 M*/ 2077 2078 /*MC 2079 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2080 2081 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2082 and `MATMPISELL` otherwise. As a result, for single process communicators, 2083 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2084 for communicators controlling multiple processes. It is recommended that you call both of 2085 the above preallocation routines for simplicity. 2086 2087 Options Database Keys: 2088 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2089 2090 Level: beginner 2091 2092 Notes: 2093 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2094 2095 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2096 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2097 2098 Developer Notes: 2099 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2100 2101 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2102 .vb 2103 (2 0 3 4) 2104 Consider the matrix A = (5 0 6 0) 2105 (0 0 7 8) 2106 (0 0 9 9) 2107 2108 symbolically the Ellpack format can be written as 2109 2110 (2 3 4 |) (0 2 3 |) 2111 v = (5 6 0 |) colidx = (0 2 2 |) 2112 -------- --------- 2113 (7 8 |) (2 3 |) 2114 (9 9 |) (2 3 |) 2115 2116 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2117 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2118 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2119 2120 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2121 2122 .ve 2123 2124 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2125 2126 References: 2127 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2128 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2129 2130 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2131 M*/ 2132 2133 /*@C 2134 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2135 2136 Collective on comm 2137 2138 Input Parameters: 2139 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2140 . m - number of rows 2141 . n - number of columns 2142 . rlenmax - maximum number of nonzeros in a row 2143 - rlen - array containing the number of nonzeros in the various rows 2144 (possibly different for each row) or NULL 2145 2146 Output Parameter: 2147 . A - the matrix 2148 2149 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2150 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2151 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2152 2153 Notes: 2154 If nnz is given then nz is ignored 2155 2156 Specify the preallocated storage with either rlenmax or rlen (not both). 2157 Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory 2158 allocation. For large problems you MUST preallocate memory or you 2159 will get TERRIBLE performance, see the users' manual chapter on matrices. 2160 2161 Level: intermediate 2162 2163 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2164 @*/ 2165 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) 2166 { 2167 PetscFunctionBegin; 2168 PetscCall(MatCreate(comm, A)); 2169 PetscCall(MatSetSizes(*A, m, n, m, n)); 2170 PetscCall(MatSetType(*A, MATSEQSELL)); 2171 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen)); 2172 PetscFunctionReturn(0); 2173 } 2174 2175 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2176 { 2177 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2178 PetscInt totalslices = a->totalslices; 2179 2180 PetscFunctionBegin; 2181 /* If the matrix dimensions are not equal,or no of nonzeros */ 2182 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2183 *flg = PETSC_FALSE; 2184 PetscFunctionReturn(0); 2185 } 2186 /* if the a->colidx are the same */ 2187 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2188 if (!*flg) PetscFunctionReturn(0); 2189 /* if a->val are the same */ 2190 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2191 PetscFunctionReturn(0); 2192 } 2193 2194 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2195 { 2196 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2197 2198 PetscFunctionBegin; 2199 a->idiagvalid = PETSC_FALSE; 2200 PetscFunctionReturn(0); 2201 } 2202 2203 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2204 { 2205 #if defined(PETSC_USE_COMPLEX) 2206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2207 PetscInt i; 2208 PetscScalar *val = a->val; 2209 2210 PetscFunctionBegin; 2211 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2212 #else 2213 PetscFunctionBegin; 2214 #endif 2215 PetscFunctionReturn(0); 2216 } 2217