/*
  Defines the basic matrix operations for the SELL matrix storage format.
*/
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/
#include <petscblaslapack.h>
#include <petsc/private/kernels/blocktranspose.h>

/* One-shot citation for the paper describing this vectorized SELL implementation */
static PetscBool  cited      = PETSC_FALSE;
static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
                               " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
                               " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
                               " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
                               " year = 2018\n"
                               "}\n";

/* SIMD kernels below require immintrin.h, real double precision, and 32-bit indices
   (the gather intrinsics consume 32-bit column indices) */
#if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

#include <immintrin.h>

#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8
#endif

#if defined(__AVX512F__)
/* these do not work
  vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
  vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
*/
/* Load one slice column (8 column indices + 8 values), gather the 8 x entries,
   and fused-multiply-add into the running sums in vec_y.
   Relies on the caller keeping acolidx/aval pointed at the current slice column. */
#define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
  /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
  vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
  vec_vals = _mm512_loadu_pd(aval); \
  vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
  vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
#elif defined(__AVX2__) && defined(__FMA__)
/* 4-wide AVX2 analogue of the macro above: gather 4 x entries via the 4 column
   indices at acolidx and fused-multiply-add against the 4 values at aval. */
#define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
  vec_vals = _mm256_loadu_pd(aval); \
  vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
  vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
  vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
#endif
#endif /* PETSC_HAVE_IMMINTRIN_H */

/*@C
   MatSeqSELLSetPreallocation - For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameter nz
   (or the array nnz).  By setting these parameters accurately, performance
   during matrix assembly can be increased significantly.

   Collective

   Input Parameters:
+  B - The `MATSEQSELL` matrix
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Notes:
   If nnz is given then nz is ignored.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   You can call `MatGetInfo()` to get information on how effective the preallocation was;
   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
   You can also run with the option -info and look for messages with the string
   malloc in them to see if additional memory allocation was needed.

   Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
   entries or columns indices.

   The maximum number of nonzeros in any row should be as accurate as possible.
   If it is underestimated, you will get bad performance due to reallocation
   (MatSeqXSELLReallocateSELL).

   Level: intermediate

.seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
@*/
PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidType(B, 1);
  /* Dispatch to the type-specific implementation (no-op for non-SELL types) */
  PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
  PetscFunctionReturn(0);
}

/* Type-specific preallocation: builds the slice index b->sliidx (prefix sums of
   8 * max-row-length per slice of 8 rows) and allocates val/colidx/rlen storage. */
PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) {
  Mat_SeqSELL *b;
  PetscInt     i, j, totalslices;
  PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

  PetscFunctionBegin;
  if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
  if (maxallocrow == MAT_SKIP_ALLOCATION) {
    skipallocation = PETSC_TRUE;
    maxallocrow    = 0;
  }

  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
  if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
  PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
  if (rlen) {
    /* validate per-row lengths before trusting them below */
    for (i = 0; i < B->rmap->n; i++) {
      PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
      PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
    }
  }

  B->preallocated = PETSC_TRUE;

  b = (Mat_SeqSELL *)B->data;

  /* each slice holds 8 rows; the last slice is padded up to 8 */
  totalslices    = PetscCeilInt(B->rmap->n, 8);
  b->totalslices = totalslices;
  if (!skipallocation) {
    if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));

    if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
      PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
    }
    if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
      /* NOTE(review): maxallocrow was already normalized to 5 for PETSC_DEFAULT/PETSC_DECIDE
         and checked >= 0 above, so both branches below look unreachable — verify intent */
      if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
      else if (maxallocrow < 0) maxallocrow = 1;
      for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
    } else {
      /* slice i gets 8 * (max rlen over its 8 rows); sliidx becomes the running prefix sum */
      maxallocrow  = 0;
      b->sliidx[0] = 0;
      for (i = 1; i < totalslices; i++) {
        b->sliidx[i] = 0;
        for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
        maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
        /* overflow-checked: sliidx[i] = sliidx[i-1] + 8*max_row_len(slice i-1) */
        PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
      }
      /* last slice: only B->rmap->n - 8*(totalslices-1) real rows contribute */
      b->sliidx[totalslices] = 0;
      for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
      maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
      b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
    }

    /* allocate space for val, colidx, rlen */
    /* FIXME: should B's old memory be unlogged? */
    PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
    /* FIXME: assuming an element of the bit array takes 8 bits */
    PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
    /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
    PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));

    b->singlemalloc = PETSC_TRUE;
    b->free_val     = PETSC_TRUE;
    b->free_colidx  = PETSC_TRUE;
  } else {
    /* MAT_SKIP_ALLOCATION: caller owns val/colidx; do not free them on destroy */
    b->free_val    = PETSC_FALSE;
    b->free_colidx = PETSC_FALSE;
  }

  b->nz               = 0;
  b->maxallocrow      = maxallocrow;
  b->rlenmax          = maxallocrow;
  b->maxallocmat      = b->sliidx[totalslices];
  B->info.nz_unneeded = (double)b->maxallocmat;
  if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* Copy row `row` out of the sliced storage into the matrix's scratch buffers
   (a->getrowcols / a->getrowvals, lazily allocated with rlenmax capacity).
   Entry k of the row lives at sliidx[row/8] + (row%8) + 8*k. The returned
   pointers remain owned by the matrix; callers must not free them. */
PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     shift;

  PetscFunctionBegin;
  PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
  if (nz) *nz = a->rlen[row];
  /* starting index of row `row`: slice offset plus lane within the slice */
  shift = a->sliidx[row >> 3] + (row & 0x07);
  if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
  if (idx) {
    PetscInt j;
    for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
    *idx = a->getrowcols;
  }
  if (v) {
    PetscInt j;
    for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
    *v = a->getrowvals;
  }
  PetscFunctionReturn(0);
}

/* Nothing to release: MatGetRow_SeqSELL hands out matrix-owned scratch buffers */
PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

/* Convert a SeqSELL matrix to SeqAIJ by copying it row by row */
PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat          B;
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
    PetscCall(MatZeroEntries(B));
  } else {
    /* a->rlen doubles as the exact per-row preallocation for the AIJ copy */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N,
                          A->cmap->N));
    PetscCall(MatSetType(B, MATSEQAIJ));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
  }

  for (i = 0; i < A->rmap->n; i++) {
    PetscInt     nz = 0, *cols = NULL;
    PetscScalar *vals = NULL;

    PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
    PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
  }

  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->rmap->bs = A->rmap->bs;

  if (reuse == MAT_INPLACE_MATRIX) {
    /* replace A's internals with B's, destroying the SELL data */
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/seq/aij.h>

/* Convert a SeqAIJ matrix to SeqSELL, preallocating from the AIJ row lengths */
PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat                B;
  Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
  PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;

  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
  } else {
    /* compute row lengths from ai when debugging (to cross-check ilen) or when ilen is absent */
    if (PetscDefined(USE_DEBUG) || !a->ilen) {
      PetscCall(PetscMalloc1(m, &rowlengths));
      for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
    }
    if (PetscDefined(USE_DEBUG) && a->ilen) {
      PetscBool eq;
      PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
      PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
      PetscCall(PetscFree(rowlengths));
      rowlengths = a->ilen;
    } else if (a->ilen) rowlengths = a->ilen;
    /* sequential matrix: local sizes equal global sizes */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, m, n, m, n));
    PetscCall(MatSetType(B, MATSEQSELL));
    PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
    if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
  }

  for (row = 0; row < m; row++) {
    PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
    PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
  }
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->rmap->bs = A->rmap->bs;

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

/* y = A*x. aval/acolidx are walked through the storage in slice order; each SIMD
   variant processes one slice (8 rows) at a time. */
PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *y;
  const PetscScalar *x;
  const MatScalar   *aval        = a->val;
  PetscInt           totalslices = a->totalslices;
  const PetscInt    *acolidx     = a->colidx;
  PetscInt           i, j;
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  /* four independent accumulators to hide FMA latency in the 4x-unrolled loop */
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx;
  __mmask8 mask;
  __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
  __m256i  vec_idx2, vec_idx3, vec_idx4;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128i   vec_idx;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, rows_left, row, nnz_in_row;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128d   vec_x_tmp;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, rows_left, row, nnz_in_row;
#else
  /* scalar fallback: one running sum per row of the slice */
  PetscScalar sum[8];
#endif

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    vec_y  = _mm512_setzero_pd();
    vec_y2 = _mm512_setzero_pd();
    vec_y3 = _mm512_setzero_pd();
    vec_y4 = _mm512_setzero_pd();

    j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
    /* peel (#columns mod 4) iterations so the main loop below can be unrolled by 4;
       cases intentionally fall through the shared pointer-advance pattern */
    switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
    case 3:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      j += 3;
      break;
    case 2:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      j += 2;
      break;
    case 1:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      j += 1;
      break;
    }
    /* main loop, 4 slice columns per iteration into 4 independent accumulators */
#pragma novector
    for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
      acolidx += 8;
      aval += 8;
    }

    /* combine the partial sums and store, masking out padding rows in the last slice */
    vec_y = _mm512_add_pd(vec_y, vec_y2);
    vec_y = _mm512_add_pd(vec_y, vec_y3);
    vec_y = _mm512_add_pd(vec_y, vec_y4);
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
      _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
    } else {
      _mm512_storeu_pd(&y[8 * i], vec_y);
    }
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* last slice may have padding rows. Don't use vectorization. */
    /* aval/acolidx were advanced through all previous slices, so indexing here is
       relative to the start of this slice */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      rows_left = A->rmap->n - 8 * i;
      for (r = 0; r < rows_left; ++r) {
        yval       = (MatScalar)0;
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        y[row] = yval;
      }
      break;
    }

    vec_y  = _mm256_setzero_pd();
    vec_y2 = _mm256_setzero_pd();

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
#pragma novector
#pragma unroll(2)
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      aval += 4;
      acolidx += 4;
      AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
      aval += 4;
      acolidx += 4;
    }

    _mm256_storeu_pd(y + i * 8, vec_y);
    _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    vec_y  = _mm256_setzero_pd();
    vec_y2 = _mm256_setzero_pd();

    /* last slice may have padding rows. Don't use vectorization. */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      rows_left = A->rmap->n - 8 * i;
      for (r = 0; r < rows_left; ++r) {
        yval       = (MatScalar)0;
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        y[row] = yval;
      }
      break;
    }

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
    /* AVX has no gather: x entries are assembled pairwise with loadl/loadh then
       inserted into the 256-bit register */
#pragma novector
#pragma unroll(2)
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_setzero_pd();
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
      aval += 4;

      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
      aval += 4;
    }

    _mm256_storeu_pd(y + i * 8, vec_y);
    _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
  }
#else
  /* portable scalar kernel: 8 running sums, one per row of the slice */
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    for (j = 0; j < 8; j++) sum[j] =
      0.0;
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      sum[0] += aval[j] * x[acolidx[j]];
      sum[1] += aval[j + 1] * x[acolidx[j + 1]];
      sum[2] += aval[j + 2] * x[acolidx[j + 2]];
      sum[3] += aval[j + 3] * x[acolidx[j + 3]];
      sum[4] += aval[j + 4] * x[acolidx[j + 4]];
      sum[5] += aval[j + 5] * x[acolidx[j + 5]];
      sum[6] += aval[j + 6] * x[acolidx[j + 6]];
      sum[7] += aval[j + 7] * x[acolidx[j + 7]];
    }
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
      for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
    } else {
      for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
    }
  }
#endif

  PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
/* z = y + A*x. Mirrors MatMult_SeqSELL but seeds the accumulator from y. */
PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *y, *z;
  const PetscScalar *x;
  const MatScalar   *aval        = a->val;
  PetscInt           totalslices = a->totalslices;
  const PetscInt    *acolidx     = a->colidx;
  PetscInt           i, j;
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx;
  __mmask8 mask;
  __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
  __m256i  vec_idx2, vec_idx3, vec_idx4;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128d   vec_x_tmp;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, row, nnz_in_row;
#else
  PetscScalar sum[8];
#endif

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &y, &z));
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* seed the first accumulator with y; padding lanes of the last slice are masked off
       on both load and store */
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
      /* NOTE(review): vec_y is uninitialized here, so masked-off lanes hold garbage;
         harmless because the store below uses the same mask — confirm intended */
      vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
    } else {
      vec_y = _mm512_loadu_pd(&y[8 * i]);
    }
    vec_y2 = _mm512_setzero_pd();
    vec_y3 = _mm512_setzero_pd();
    vec_y4 = _mm512_setzero_pd();

    j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
    /* peel (#columns mod 4) iterations, then run the 4x-unrolled main loop */
    switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
    case 3:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      j += 3;
      break;
    case 2:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      j += 2;
      break;
    case 1:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      j += 1;
      break;
    }
#pragma novector
    for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
      acolidx += 8;
      aval += 8;
    }

    vec_y = _mm512_add_pd(vec_y, vec_y2);
    vec_y = _mm512_add_pd(vec_y, vec_y3);
    vec_y = _mm512_add_pd(vec_y, vec_y4);
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
    } else {
      _mm512_storeu_pd(&z[8 * i], vec_y);
    }
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* last slice may have padding rows. Don't use vectorization. */
    /* aval/acolidx were advanced through previous slices, so relative indexing is valid */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      for (r = 0; r < (A->rmap->n & 0x07); ++r) {
        row        = 8 * i + r;
        yval       = (MatScalar)0.0;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        z[row] = y[row] + yval;
      }
      break;
    }

    vec_y  = _mm256_loadu_pd(y + 8 * i);
    vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_setzero_pd();
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_setzero_pd();
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
      aval += 4;

      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
      aval += 4;
    }

    _mm256_storeu_pd(z + i * 8, vec_y);
    _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
  }
#else
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    for (j = 0; j < 8; j++) sum[j] = 0.0;
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      sum[0] += aval[j] * x[acolidx[j]];
      sum[1] += aval[j + 1] * x[acolidx[j + 1]];
      sum[2] += aval[j + 2] * x[acolidx[j + 2]];
      sum[3] += aval[j + 3] * x[acolidx[j + 3]];
      sum[4] += aval[j + 4] * x[acolidx[j + 4]];
      sum[5] += aval[j + 5] * x[acolidx[j + 5]];
      sum[6] += aval[j + 6] * x[acolidx[j + 6]];
      sum[7] += aval[j + 7] * x[acolidx[j + 7]];
    }
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
    } else {
      for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
    }
  }
#endif

  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
  PetscFunctionReturn(0);
}

/* yy = zz + A^T * xx, scattering each slice's contributions into y.
   For symmetric matrices this defers to MatMultAdd_SeqSELL. */
PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *y;
  const PetscScalar *x;
  const MatScalar   *aval    = a->val;
  const PetscInt    *acolidx = a->colidx;
  PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  if (A->symmetric == PETSC_BOOL3_TRUE) {
    PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
    PetscFunctionReturn(0);
  }
  if (zz != yy) PetscCall(VecCopy(zz, yy));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  for (i = 0; i < a->totalslices; i++) { /* loop over slices */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      /* NOTE(review): unlike the MatMult kernels, aval/acolidx are never advanced in this
         function, yet this branch indexes them from the array base (no a->sliidx[i] offset).
         That looks correct only when totalslices == 1 — verify against upstream PETSc. */
      for (r = 0; r < (A->rmap->n & 0x07); ++r) {
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
      }
      break;
    }
    /* full slice: scatter all 8 rows' entries, absolute indexing via sliidx */
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      y[acolidx[j]] += aval[j] * x[8 * i];
      y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
      y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
      y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
      y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
      y[acolidx[j + 5]] +=
aval[j + 5] * x[8 * i + 5]; 712 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 713 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 714 } 715 } 716 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 717 PetscCall(VecRestoreArrayRead(xx, &x)); 718 PetscCall(VecRestoreArray(yy, &y)); 719 PetscFunctionReturn(0); 720 } 721 722 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) { 723 PetscFunctionBegin; 724 if (A->symmetric == PETSC_BOOL3_TRUE) { 725 PetscCall(MatMult_SeqSELL(A, xx, yy)); 726 } else { 727 PetscCall(VecSet(yy, 0.0)); 728 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 729 } 730 PetscFunctionReturn(0); 731 } 732 733 /* 734 Checks for missing diagonals 735 */ 736 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) { 737 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 738 PetscInt *diag, i; 739 740 PetscFunctionBegin; 741 *missing = PETSC_FALSE; 742 if (A->rmap->n > 0 && !(a->colidx)) { 743 *missing = PETSC_TRUE; 744 if (d) *d = 0; 745 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 746 } else { 747 diag = a->diag; 748 for (i = 0; i < A->rmap->n; i++) { 749 if (diag[i] == -1) { 750 *missing = PETSC_TRUE; 751 if (d) *d = i; 752 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 753 break; 754 } 755 } 756 } 757 PetscFunctionReturn(0); 758 } 759 760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) { 761 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 762 PetscInt i, j, m = A->rmap->n, shift; 763 764 PetscFunctionBegin; 765 if (!a->diag) { 766 PetscCall(PetscMalloc1(m, &a->diag)); 767 a->free_diag = PETSC_TRUE; 768 } 769 for (i = 0; i < m; i++) { /* loop over rows */ 770 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 771 a->diag[i] = -1; 772 for (j = 0; j < a->rlen[i]; j++) { 773 if (a->colidx[shift + j * 8] == i) { 774 a->diag[i] = shift + j * 8; 775 break; 776 } 777 } 778 } 779 PetscFunctionReturn(0); 780 } 781 782 /* 783 
Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 784 */ 785 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) { 786 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 787 PetscInt i, *diag, m = A->rmap->n; 788 MatScalar *val = a->val; 789 PetscScalar *idiag, *mdiag; 790 791 PetscFunctionBegin; 792 if (a->idiagvalid) PetscFunctionReturn(0); 793 PetscCall(MatMarkDiagonal_SeqSELL(A)); 794 diag = a->diag; 795 if (!a->idiag) { 796 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 797 val = a->val; 798 } 799 mdiag = a->mdiag; 800 idiag = a->idiag; 801 802 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 803 for (i = 0; i < m; i++) { 804 mdiag[i] = val[diag[i]]; 805 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 806 if (PetscRealPart(fshift)) { 807 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 808 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 809 A->factorerror_zeropivot_value = 0.0; 810 A->factorerror_zeropivot_row = i; 811 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 812 } 813 idiag[i] = 1.0 / val[diag[i]]; 814 } 815 PetscCall(PetscLogFlops(m)); 816 } else { 817 for (i = 0; i < m; i++) { 818 mdiag[i] = val[diag[i]]; 819 idiag[i] = omega / (fshift + val[diag[i]]); 820 } 821 PetscCall(PetscLogFlops(2.0 * m)); 822 } 823 a->idiagvalid = PETSC_TRUE; 824 PetscFunctionReturn(0); 825 } 826 827 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) { 828 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 829 830 PetscFunctionBegin; 831 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 832 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 833 PetscFunctionReturn(0); 834 } 835 836 PetscErrorCode MatDestroy_SeqSELL(Mat A) { 837 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 838 839 PetscFunctionBegin; 840 #if defined(PETSC_USE_LOG) 841 PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" 
PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
#endif
  PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
  PetscCall(ISDestroy(&a->row));
  PetscCall(ISDestroy(&a->col));
  PetscCall(PetscFree(a->diag));
  PetscCall(PetscFree(a->rlen));
  PetscCall(PetscFree(a->sliidx));
  PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
  PetscCall(PetscFree(a->solve_work));
  PetscCall(ISDestroy(&a->icol));
  PetscCall(PetscFree(a->saved_values));
  PetscCall(PetscFree2(a->getrowcols, a->getrowvals));

  PetscCall(PetscFree(A->data));

  /* reset type name and remove all composed methods registered at creation */
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
  PetscFunctionReturn(0);
}

/* Apply a MatOption to the SeqSELL data structure.
   a->nonew encoding: 0 = allow new nonzeros, 1 = silently ignore them,
   -1 = error on new nonzero location, -2 = error on new allocation. */
PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  switch (op) {
  case MAT_ROW_ORIENTED: a->roworiented = flg; break;
  case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
  case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
  case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
  case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
  case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ?
-1 : 0); break;
  case MAT_FORCE_DIAGONAL_ENTRIES:
  case MAT_IGNORE_OFF_PROC_ENTRIES:
  case MAT_USE_HASH_TABLE:
  case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
  case MAT_SPD:
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD_ETERNAL:
    /* These options are handled directly by MatSetOption() */
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
  }
  PetscFunctionReturn(0);
}

/* Extract the diagonal of A into v. For factored matrices the stored diagonal
   holds the pivot, so its reciprocal is returned instead. */
PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, j, n, shift;
  PetscScalar *x, zero = 0.0;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(v, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");

  if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
    PetscInt *diag = a->diag;
    PetscCall(VecGetArray(v, &x));
    for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; /* factored case: stored value is the pivot */
    PetscCall(VecRestoreArray(v, &x));
    PetscFunctionReturn(0);
  }

  PetscCall(VecSet(v, zero));
  PetscCall(VecGetArray(v, &x));
  for (i = 0; i < n; i++) {                 /* loop over rows */
    shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
    x[i]  = 0;
    for (j = 0; j < a->rlen[i]; j++) {
      if (a->colidx[shift + j * 8] == i) { /* row entries strided by slice height 8 */
        x[i] = a->val[shift + j * 8];
        break;
      }
    }
  }
  PetscCall(VecRestoreArray(v, &x));
  PetscFunctionReturn(0);
}

/* A = diag(ll) * A * diag(rr); either vector may be NULL to skip that side. */
PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  const PetscScalar *l, *r;
  PetscInt           i, j, m, n, row;

  PetscFunctionBegin;
  if (ll) {
    /* The local size is used so that VecMPI can be passed to this routine
       by MatDiagonalScale_MPISELL */
PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetArrayRead(ll, &l));
    for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
      if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
        /* row cycles 0..7 within the slice; skip entries belonging to padding rows */
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
          if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
        }
      } else {
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
      }
    }
    PetscCall(VecRestoreArrayRead(ll, &l));
    PetscCall(PetscLogFlops(a->nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetArrayRead(rr, &r));
    for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
      if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
          if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
        }
      } else {
        /* right scaling indexes by stored column, so no row bookkeeping needed */
        for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
      }
    }
    PetscCall(VecRestoreArrayRead(rr, &r));
    PetscCall(PetscLogFlops(a->nz));
  }
  PetscCall(MatSeqSELLInvalidateDiagonal(A)); /* diagonal values changed */
  PetscFunctionReturn(0);
}

/* Gather the m x n submatrix indexed by im[]/in[] into v (row-major).
   Negative row/column indices are skipped; absent entries yield 0.0. */
PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt    *cp, i, k, low, high, t, row, col, l;
  PetscInt     shift;
  MatScalar   *vp;

  PetscFunctionBegin;
  for (k = 0; k < m; k++) { /* loop over requested rows */
    row = im[k];
    if (row < 0) continue;
    PetscCheck(row < A->rmap->n, PETSC_COMM_SELF,
PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
    shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
    cp    = a->colidx + shift;                  /* pointer to the row */
    vp    = a->val + shift;                     /* pointer to the row */
    for (l = 0; l < n; l++) {                   /* loop over requested columns */
      col = in[l];
      if (col < 0) continue;
      PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
      high = a->rlen[row];
      low  = 0; /* assume unsorted */
      /* bisect down to a window of <= 5 entries, then scan linearly;
         row entries are strided by 8 (the slice height) */
      while (high - low > 5) {
        t = (low + high) / 2;
        if (*(cp + t * 8) > col) high = t;
        else low = t;
      }
      for (i = low; i < high; i++) {
        if (*(cp + 8 * i) > col) break;
        if (*(cp + 8 * i) == col) {
          *v++ = *(vp + 8 * i);
          goto finished;
        }
      }
      *v++ = 0.0; /* entry not stored */
    finished:;
    }
  }
  PetscFunctionReturn(0);
}

/* ASCII viewer for SeqSELL: dispatches on the viewer format (MATLAB, info,
   common, dense, MatrixMarket, native slice dump, or default row listing). */
PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) {
  Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
  PetscInt          i, j, m = A->rmap->n, shift;
  const char       *name;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscViewerGetFormat(viewer, &format));
  if (format == PETSC_VIEWER_ASCII_MATLAB) {
    PetscInt nofinalvalue = 0;
    /*
      if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
        nofinalvalue = 1;
      }
    */
    PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
    PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
    PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
#if defined(PETSC_USE_COMPLEX)
    PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
#else
    PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT
",3);\n", a->nz + nofinalvalue)); 1034 #endif 1035 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1036 1037 for (i = 0; i < m; i++) { 1038 shift = a->sliidx[i >> 3] + (i & 0x07); 1039 for (j = 0; j < a->rlen[i]; j++) { 1040 #if defined(PETSC_USE_COMPLEX) 1041 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1042 #else 1043 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1044 #endif 1045 } 1046 } 1047 /* 1048 if (nofinalvalue) { 1049 #if defined(PETSC_USE_COMPLEX) 1050 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1051 #else 1052 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1053 #endif 1054 } 1055 */ 1056 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1057 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1058 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1059 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1060 PetscFunctionReturn(0); 1061 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1062 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1063 for (i = 0; i < m; i++) { 1064 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1065 shift = a->sliidx[i >> 3] + (i & 0x07); 1066 for (j = 0; j < a->rlen[i]; j++) { 1067 #if defined(PETSC_USE_COMPLEX) 1068 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1069 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), 
(double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1070 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1071 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1072 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1073 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1074 } 1075 #else 1076 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1077 #endif 1078 } 1079 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1080 } 1081 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1082 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1083 PetscInt cnt = 0, jcnt; 1084 PetscScalar value; 1085 #if defined(PETSC_USE_COMPLEX) 1086 PetscBool realonly = PETSC_TRUE; 1087 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1088 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1089 realonly = PETSC_FALSE; 1090 break; 1091 } 1092 } 1093 #endif 1094 1095 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1096 for (i = 0; i < m; i++) { 1097 jcnt = 0; 1098 shift = a->sliidx[i >> 3] + (i & 0x07); 1099 for (j = 0; j < A->cmap->n; j++) { 1100 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1101 value = a->val[cnt++]; 1102 jcnt++; 1103 } else { 1104 value = 0.0; 1105 } 1106 #if defined(PETSC_USE_COMPLEX) 1107 if (realonly) { 1108 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1109 } else { 1110 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1111 } 1112 #else 1113 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1114 
#endif 1115 } 1116 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1117 } 1118 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1119 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1120 PetscInt fshift = 1; 1121 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1122 #if defined(PETSC_USE_COMPLEX) 1123 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1124 #else 1125 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1126 #endif 1127 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1128 for (i = 0; i < m; i++) { 1129 shift = a->sliidx[i >> 3] + (i & 0x07); 1130 for (j = 0; j < a->rlen[i]; j++) { 1131 #if defined(PETSC_USE_COMPLEX) 1132 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1133 #else 1134 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1135 #endif 1136 } 1137 } 1138 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1139 } else if (format == PETSC_VIEWER_NATIVE) { 1140 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1141 PetscInt row; 1142 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1143 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1144 #if defined(PETSC_USE_COMPLEX) 1145 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1146 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 
1147 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1148 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1149 } else { 1150 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1151 } 1152 #else 1153 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1154 #endif 1155 } 1156 } 1157 } else { 1158 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1159 if (A->factortype) { 1160 for (i = 0; i < m; i++) { 1161 shift = a->sliidx[i >> 3] + (i & 0x07); 1162 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1163 /* L part */ 1164 for (j = shift; j < a->diag[i]; j += 8) { 1165 #if defined(PETSC_USE_COMPLEX) 1166 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1167 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1168 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1169 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1170 } else { 1171 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1172 } 1173 #else 1174 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1175 #endif 1176 } 1177 /* diagonal */ 1178 j = a->diag[i]; 1179 #if defined(PETSC_USE_COMPLEX) 1180 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1181 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), 
(double)PetscImaginaryPart(1.0 / a->val[j]))); 1182 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1183 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1184 } else { 1185 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1186 } 1187 #else 1188 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1189 #endif 1190 1191 /* U part */ 1192 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1193 #if defined(PETSC_USE_COMPLEX) 1194 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1195 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1196 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1197 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1198 } else { 1199 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1200 } 1201 #else 1202 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1203 #endif 1204 } 1205 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1206 } 1207 } else { 1208 for (i = 0; i < m; i++) { 1209 shift = a->sliidx[i >> 3] + (i & 0x07); 1210 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1211 for (j = 0; j < a->rlen[i]; j++) { 1212 #if defined(PETSC_USE_COMPLEX) 1213 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1214 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 
1215 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1217 } else { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1219 } 1220 #else 1221 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1222 #endif 1223 } 1224 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1225 } 1226 } 1227 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1228 } 1229 PetscCall(PetscViewerFlush(viewer)); 1230 PetscFunctionReturn(0); 1231 } 1232 1233 #include <petscdraw.h> 1234 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) { 1235 Mat A = (Mat)Aa; 1236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1237 PetscInt i, j, m = A->rmap->n, shift; 1238 int color; 1239 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1240 PetscViewer viewer; 1241 PetscViewerFormat format; 1242 1243 PetscFunctionBegin; 1244 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1245 PetscCall(PetscViewerGetFormat(viewer, &format)); 1246 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1247 1248 /* loop over matrix elements drawing boxes */ 1249 1250 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1251 PetscDrawCollectiveBegin(draw); 1252 /* Blue for negative, Cyan for zero and Red for positive */ 1253 color = PETSC_DRAW_BLUE; 1254 for (i = 0; i < m; i++) { 1255 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1256 y_l = m - i - 1.0; 1257 y_r = y_l + 1.0; 1258 for (j = 0; j < a->rlen[i]; j++) { 1259 x_l = a->colidx[shift + j * 8]; 1260 x_r = x_l + 1.0; 1261 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) 
continue; 1262 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1263 } 1264 } 1265 color = PETSC_DRAW_CYAN; 1266 for (i = 0; i < m; i++) { 1267 shift = a->sliidx[i >> 3] + (i & 0x07); 1268 y_l = m - i - 1.0; 1269 y_r = y_l + 1.0; 1270 for (j = 0; j < a->rlen[i]; j++) { 1271 x_l = a->colidx[shift + j * 8]; 1272 x_r = x_l + 1.0; 1273 if (a->val[shift + 8 * j] != 0.) continue; 1274 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1275 } 1276 } 1277 color = PETSC_DRAW_RED; 1278 for (i = 0; i < m; i++) { 1279 shift = a->sliidx[i >> 3] + (i & 0x07); 1280 y_l = m - i - 1.0; 1281 y_r = y_l + 1.0; 1282 for (j = 0; j < a->rlen[i]; j++) { 1283 x_l = a->colidx[shift + j * 8]; 1284 x_r = x_l + 1.0; 1285 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1286 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1287 } 1288 } 1289 PetscDrawCollectiveEnd(draw); 1290 } else { 1291 /* use contour shading to indicate magnitude of values */ 1292 /* first determine max of all nonzero values */ 1293 PetscReal minv = 0.0, maxv = 0.0; 1294 PetscInt count = 0; 1295 PetscDraw popup; 1296 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1297 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1298 } 1299 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1300 PetscCall(PetscDrawGetPopup(draw, &popup)); 1301 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1302 1303 PetscDrawCollectiveBegin(draw); 1304 for (i = 0; i < m; i++) { 1305 shift = a->sliidx[i >> 3] + (i & 0x07); 1306 y_l = m - i - 1.0; 1307 y_r = y_l + 1.0; 1308 for (j = 0; j < a->rlen[i]; j++) { 1309 x_l = a->colidx[shift + j * 8]; 1310 x_r = x_l + 1.0; 1311 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1312 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1313 count++; 1314 } 1315 } 1316 PetscDrawCollectiveEnd(draw); 1317 } 1318 
  PetscFunctionReturn(0);
}

#include <petscdraw.h>
/* Set up draw coordinates (with a 10% margin on each side) and hand the actual
   rendering to MatView_SeqSELL_Draw_Zoom via PetscDrawZoom. */
PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) {
  PetscDraw draw;
  PetscReal xr, yr, xl, yl, h, w;
  PetscBool isnull;

  PetscFunctionBegin;
  PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
  PetscCall(PetscDrawIsNull(draw, &isnull));
  if (isnull) PetscFunctionReturn(0);

  xr = A->cmap->n;
  yr = A->rmap->n;
  h  = yr / 10.0;
  w  = xr / 10.0;
  xr += w;
  yr += h;
  xl = -w;
  yl = -h;
  PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
  /* the zoom callback retrieves the viewer through this composed reference */
  PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
  PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
  PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
  PetscCall(PetscDrawSave(draw));
  PetscFunctionReturn(0);
}

/* Top-level viewer dispatch for SeqSELL: ASCII, binary (not yet implemented), draw. */
PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) {
  PetscBool iascii, isbinary, isdraw;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  if (iascii) {
    PetscCall(MatView_SeqSELL_ASCII(A, viewer));
  } else if (isbinary) {
    /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
  } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
  PetscFunctionReturn(0);
}

/* Finish assembly: mark diagonals, report storage statistics, and fill the
   unused (padding) slots of each slice so vectorized kernels can run unmasked. */
PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
  MatScalar   *vp;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  /* To do: compress out the unused elements */
  PetscCall(MatMarkDiagonal_SeqSELL(A));
  PetscCall(PetscInfo(A, "Matrix
size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
  /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
  for (i = 0; i < a->totalslices; ++i) {
    shift = a->sliidx[i];    /* starting index of the slice */
    cp    = a->colidx + shift; /* pointer to the column indices of the slice */
    vp    = a->val + shift;    /* pointer to the nonzero values of the slice */
    for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
      row  = 8 * i + row_in_slice;
      nrow = a->rlen[row]; /* number of nonzeros in row */
      /*
        Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
        But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
      */
      lastcol = 0;
      if (nrow > 0) { /* nonempty row */
        lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
      } else if (!row_in_slice) { /* first row of the current slice is empty */
        for (j = 1; j < 8; j++) {
          if (a->rlen[8 * i + j]) {
            lastcol = cp[j];
            break;
          }
        }
      } else {
        if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
      }

      /* pad the remainder of this row's lane out to the slice width */
      for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
        cp[8 * k + row_in_slice] = lastcol;
        vp[8 * k + row_in_slice] = (MatScalar)0;
      }
    }
  }

  A->info.mallocs += a->reallocs;
  a->reallocs = 0;

  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  PetscFunctionReturn(0);
}

/* Report storage statistics; nz_used counts stored entries INCLUDING slice padding. */
PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  info->block_size   = 1.0;
  info->nz_allocated = a->maxallocmat;
  info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
  info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
  info->assemblies   = A->num_ass;
  info->mallocs      = A->info.mallocs;
  info->memory       = 0; /* REVIEW ME */
  if (A->factortype) {
    info->fill_ratio_given  = A->info.fill_ratio_given;
    info->fill_ratio_needed = A->info.fill_ratio_needed;
    info->factor_mallocs    = A->info.factor_mallocs;
  } else {
    info->fill_ratio_given  = 0;
    info->fill_ratio_needed = 0;
    info->factor_mallocs    = 0;
  }
  PetscFunctionReturn(0);
}

/* Insert or add an m x n block of values; rows/columns with negative indices
   are skipped. New nonzeros may trigger reallocation depending on a->nonew. */
PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
  PetscInt    *cp, nonew = a->nonew, lastcol = -1;
  MatScalar
*vp, value;

  PetscFunctionBegin;
  for (k = 0; k < m; k++) { /* loop over added rows */
    row = im[k];
    if (row < 0) continue;
    PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
    shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
    cp    = a->colidx + shift;                  /* pointer to the row */
    vp    = a->val + shift;                     /* pointer to the row */
    nrow  = a->rlen[row];
    low   = 0;
    high  = nrow;

    for (l = 0; l < n; l++) { /* loop over added columns */
      col = in[l];
      if (col < 0) continue;
      PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
      if (a->roworiented) {
        value = v[l + k * n];
      } else {
        value = v[k + l * m];
      }
      if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;

      /* search in this row for the specified column, i indicates the column to be set */
      if (col <= lastcol) low = 0; /* columns arrived out of order: restart the window */
      else high = nrow;
      lastcol = col;
      /* bisect to a window of <= 5, then scan; entries strided by slice height 8 */
      while (high - low > 5) {
        t = (low + high) / 2;
        if (*(cp + t * 8) > col) high = t;
        else low = t;
      }
      for (i = low; i < high; i++) {
        if (*(cp + i * 8) > col) break;
        if (*(cp + i * 8) == col) { /* entry exists: update in place */
          if (is == ADD_VALUES) *(vp + i * 8) += value;
          else *(vp + i * 8) = value;
          low = i + 1;
          goto noinsert;
        }
      }
      if (value == 0.0 && a->ignorezeroentries) goto noinsert;
      if (nonew == 1) goto noinsert; /* new nonzeros silently ignored */
      PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
      /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
      MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
      /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
      for (ii = nrow - 1; ii >= i; ii--) {
        *(cp + (ii + 1) * 8) = *(cp + ii * 8);
        *(vp + (ii + 1) * 8) = *(vp + ii * 8);
      }
      a->rlen[row]++;
      *(cp + i * 8) = col;
      *(vp + i * 8) = value;
      a->nz++;
      A->nonzerostate++;
      low = i + 1;
      high++;
      nrow++;
    noinsert:;
    }
    a->rlen[row] = nrow;
  }
  PetscFunctionReturn(0);
}

/* Copy A into B; uses a flat value copy when the nonzero patterns match. */
PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) {
  PetscFunctionBegin;
  /* If the two matrices have the same copy implementation, use fast copy. */
  if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
    Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
    Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;

    PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
    PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(0);
}

/* Default setup: preallocate with PETSC_DEFAULT sizing. */
PetscErrorCode MatSetUp_SeqSELL(Mat A) {
  PetscFunctionBegin;
  PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
  PetscFunctionReturn(0);
}

/* Expose the raw value array (SELL layout, padding included). */
PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  *array = a->val;
  PetscFunctionReturn(0);
}

/* Companion to MatSeqSELLGetArray_SeqSELL; nothing to release. */
PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

PetscErrorCode MatRealPart_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;
  MatScalar   *aval = a->val;

  PetscFunctionBegin;
  /* sliidx[totalslices] == total number of stored entries across all slices */
  for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
  PetscFunctionReturn(0);
}

/* Replaces each stored entry with its imaginary part; the cached inverse diagonal becomes stale */
PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;
  MatScalar   *aval = a->val;

  PetscFunctionBegin;
  for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  PetscFunctionReturn(0);
}

/* Scales all stored entries (including explicit zeros used as padding) by alpha via BLAS scal */
PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) {
  Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
  MatScalar   *aval   = a->val;
  PetscScalar  oalpha = alpha;
  PetscBLASInt one    = 1, size;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
  PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
  PetscCall(PetscLogFlops(a->nz));
  PetscCall(MatSeqSELLInvalidateDiagonal(inA));
  PetscFunctionReturn(0);
}

/* Y = Y + a*I; ensures at least one slot per row exists before delegating to MatShift_Basic */
PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) {
  Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;

  PetscFunctionBegin;
  if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
  PetscCall(MatShift_Basic(Y, a));
  PetscFunctionReturn(0);
}

/* SSOR/SOR relaxation sweeps. Row i lives in slice i>>3 at lane i&0x07; consecutive
   entries of a row are strided by 8 (the slice height). diag[] holds the storage index
   of each row's diagonal entry, idiag[] the (omega-scaled) inverted diagonal. */
PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *x, sum, *t;
  const MatScalar   *idiag = NULL, *mdiag;
  const PetscScalar *b, *xb;
  PetscInt           n, m = A->rmap->n, i, j, shift;
  const PetscInt    *diag;

  PetscFunctionBegin;
  its = its * lits;

  if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
  if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
  a->fshift = fshift;
  a->omega  = omega;

  diag  = a->diag;
  t     = a->ssor_work;
  idiag = a->idiag;
  mdiag = a->mdiag;

  PetscCall(VecGetArray(xx, &x));
  PetscCall(VecGetArrayRead(bb, &b));
  /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
  PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
  PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
  PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");

  if (flag & SOR_ZERO_INITIAL_GUESS) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i] - shift) / 8; /* number of entries strictly left of the diagonal */
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* checkpoint the lower-triangular application for a possible backward sweep */
        x[i] = sum * idiag[i];
      }
      xb = t;
      PetscCall(PetscLogFlops(a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = xb[i];
        n     = a->rlen[i] - (diag[i] - shift) / 8 - 1; /* number of entries strictly right of the diagonal */
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        if (xb == b) {
          x[i] = sum * idiag[i];
        } else {
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
    }
    its--;
  }
  while (its--) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        /* lower */
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i] - shift) / 8;
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* save application of the lower-triangular part */
        /* upper */
        n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
      }
      xb = t;
      PetscCall(PetscLogFlops(2.0 * a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = xb[i];
        if (xb == b) {
          /* whole matrix (no checkpointing available) */
          n = a->rlen[i];
          for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
          x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
        } else { /* lower-triangular part has been saved, so only apply upper-triangular */
          n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
          for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      if (xb == b) {
        PetscCall(PetscLogFlops(2.0 * a->nz));
      } else {
        PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
      }
    }
  }
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------*/
/* Function table for MATSEQSELL; slot numbers follow the _MatOps layout */
static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
                                       MatGetRow_SeqSELL,
                                       MatRestoreRow_SeqSELL,
                                       MatMult_SeqSELL,
                                       /* 4*/ MatMultAdd_SeqSELL,
                                       MatMultTranspose_SeqSELL,
                                       MatMultTransposeAdd_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_SeqSELL,
                                       NULL,
                                       /* 15*/ MatGetInfo_SeqSELL,
                                       MatEqual_SeqSELL,
                                       MatGetDiagonal_SeqSELL,
                                       MatDiagonalScale_SeqSELL,
                                       NULL,
                                       /* 20*/ NULL,
                                       MatAssemblyEnd_SeqSELL,
                                       MatSetOption_SeqSELL,
                                       MatZeroEntries_SeqSELL,
                                       /* 24*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 34*/ MatDuplicate_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 39*/ NULL,
                                       NULL,
                                       NULL,
                                       MatGetValues_SeqSELL,
                                       MatCopy_SeqSELL,
                                       /* 44*/ NULL,
                                       MatScale_SeqSELL,
                                       MatShift_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 49*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 54*/ MatFDColoringCreate_SeqXAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 59*/ NULL,
                                       MatDestroy_SeqSELL,
                                       MatView_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 74*/ NULL,
                                       MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 84*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 89*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 94*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 99*/ NULL,
                                       NULL,
                                       NULL,
                                       MatConjugate_SeqSELL,
                                       NULL,
                                       /*104*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatMissingDiagonal_SeqSELL,
                                       /*114*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*119*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*124*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_SeqXAIJ,
                                       NULL,
                                       /*144*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*150*/ NULL};

/* Snapshots the current value array into a->saved_values; requires a frozen nonzero pattern */
PetscErrorCode MatStoreValues_SeqSELL(Mat mat) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

  /* allocate space for values if not already there */
  if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }

  /* copy values over */
  PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/* Restores the value array from the snapshot taken by MatStoreValues_SeqSELL() */
PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
  PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
  PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/*@C
  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`

  Not Collective

  Input Parameters:
+ mat - a `MATSEQSELL` matrix
- array - pointer to the data

  Level: intermediate

.seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
@*/
PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) {
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQSELL: installs the function table, zero-initializes the
   per-matrix data, and composes the type-specific query/conversion methods */
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) {
  Mat_SeqSELL *b;
  PetscMPIInt  size;

  PetscFunctionBegin;
  PetscCall(PetscCitationsRegister(citation, &cited));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
  PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");

  PetscCall(PetscNew(&b));

  B->data = (void *)b;

  PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));

  b->row                = NULL;
  b->col                = NULL;
  b->icol               = NULL;
  b->reallocs           = 0;
  b->ignorezeroentries  = PETSC_FALSE;
  b->roworiented        = PETSC_TRUE;
  b->nonew              = 0;
  b->diag               = NULL;
  b->solve_work         = NULL;
  B->spptr              = NULL;
  b->saved_values       = NULL;
  b->idiag              = NULL;
  b->mdiag              = NULL;
  b->ssor_work          = NULL;
  b->omega              = 1.0;
  b->fshift             = 0.0;
  b->idiagvalid         = PETSC_FALSE;
  b->keepnonzeropattern = PETSC_FALSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
  PetscFunctionReturn(0);
}

/*
   Given a matrix generated with MatGetFactor() duplicates all the information in A into B
*/
PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) {
  Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, m = A->rmap->n;
  PetscInt     totalslices = a->totalslices;

  PetscFunctionBegin;
  C->factortype = A->factortype;
  c->row        = NULL;
  c->col        = NULL;
  c->icol       = NULL;
  c->reallocs   = 0;
  C->assembled  = PETSC_TRUE;

  PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
  PetscCall(PetscLayoutReference(A->cmap, &C->cmap));

  /* rlen has one slot per lane: 8 lanes per slice */
  PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
  PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));

  for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
  for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];

  /* allocate the matrix space */
  if (mallocmatspace) {
    PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));

    c->singlemalloc = PETSC_TRUE;

    if (m > 0) {
      PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
      if (cpvalues == MAT_COPY_VALUES) {
        PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
      } else {
        PetscCall(PetscArrayzero(c->val, a->maxallocmat));
      }
    }
  }

  c->ignorezeroentries = a->ignorezeroentries;
  c->roworiented       = a->roworiented;
  c->nonew             = a->nonew;
  if (a->diag) {
    PetscCall(PetscMalloc1(m, &c->diag));
    for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
  } else c->diag = NULL;

  c->solve_work         = NULL;
  c->saved_values       = NULL;
  c->idiag              = NULL;
  c->ssor_work          = NULL;
  c->keepnonzeropattern = a->keepnonzeropattern;
  c->free_val           = PETSC_TRUE;
  c->free_colidx        = PETSC_TRUE;

  c->maxallocmat  = a->maxallocmat;
  c->maxallocrow  = a->maxallocrow;
  c->rlenmax      = a->rlenmax;
  c->nz           = a->nz;
  C->preallocated = PETSC_TRUE;

  c->nonzerorowcnt = a->nonzerorowcnt;
  C->nonzerostate  = A->nonzerostate;

  PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
  /* only propagate the block sizes when they divide the dimensions evenly */
  if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
  PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
  PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
   based on the sliced Ellpack format

   Options Database Keys:
. -mat_type seqsell - sets the matrix type to `MATSEQSELL` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
M*/

/*MC
   MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

   This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
   and `MATMPISELL` otherwise.
   As a result, for single process communicators,
   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
. -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()

   Level: beginner

   Notes:
   This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).

   It can provide better performance on Intel and AMD processors with AVX2 or AVX512 support for matrices that have a similar number of
   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.

   Developer Notes:
   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
.vb
                            (2 0 3 4)
   Consider the matrix A =  (5 0 6 0)
                            (0 0 7 8)
                            (0 0 9 9)

   symbolically the Ellpack format can be written as

        (2 3 4 |)           (0 2 3 |)
   v =  (5 6 0 |)  colidx = (0 2 2 |)
        --------            ---------
        (7 8   |)           (2 3   |)
        (9 9   |)           (2 3   |)

   The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
   Any of the rows in a slice with fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
   zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
   The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
.ve

   See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.

   References:
.  * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
   Proceedings of the 47th International Conference on Parallel Processing, 2018.

.seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
M*/

/*@C
   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  rlenmax - maximum number of nonzeros in a row
-  rlen - array containing the number of nonzeros in the various rows
          (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]

   Notes:
   If rlen is given then rlenmax is ignored

   Specify the preallocated storage with either rlenmax or rlen (not both).
   Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   Level: intermediate

.seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
@*/
PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQSELL));
  PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
  PetscFunctionReturn(0);
}

/* Compares dimensions, nonzero counts, column indices, and values of two SELL matrices */
PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
  PetscInt     totalslices = a->totalslices;

  PetscFunctionBegin;
  /* If the matrix dimensions are not equal, or the numbers of nonzeros differ, the matrices cannot be equal */
  if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
    *flg = PETSC_FALSE;
    PetscFunctionReturn(0);
  }
  /* if the a->colidx are the same */
  PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
  if (!*flg) PetscFunctionReturn(0);
  /* if a->val are the same */
  PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
  PetscFunctionReturn(0);
}

/* Marks the cached inverted diagonal (used by MatSOR_SeqSELL) as stale */
PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  a->idiagvalid = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Conjugates all stored entries in-place; a no-op for real scalars */
PetscErrorCode MatConjugate_SeqSELL(Mat A) {
#if defined(PETSC_USE_COMPLEX)
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;
  PetscScalar *val = a->val;

  PetscFunctionBegin;
  for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
#else
  PetscFunctionBegin;
#endif
  PetscFunctionReturn(0);
}