1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter nz 48 (or the array nnz). By setting these parameters accurately, performance 49 during matrix assembly can be increased significantly. 50 51 Collective 52 53 Input Parameters: 54 + B - The `MATSEQSELL` matrix 55 . nz - number of nonzeros per row (same for all rows) 56 - nnz - array containing the number of nonzeros in the various rows 57 (possibly different for each row) or NULL 58 59 Notes: 60 If nnz is given then nz is ignored. 61 62 Specify the preallocated storage with either nz or nnz (not both). 63 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 64 allocation. For large problems you MUST preallocate memory or you 65 will get TERRIBLE performance, see the users' manual chapter on matrices. 66 67 You can call `MatGetInfo()` to get information on how effective the preallocation was; 68 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 69 You can also run with the option -info and look for messages with the string 70 malloc in them to see if additional memory allocation was needed. 71 72 Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 73 entries or columns indices. 74 75 The maximum number of nonzeos in any row should be as accurate as possible. 76 If it is underestimated, you will get bad performance due to reallocation 77 (MatSeqXSELLReallocateSELL). 78 79 Level: intermediate 80 81 .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 82 @*/ 83 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 84 { 85 PetscFunctionBegin; 86 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 87 PetscValidType(B, 1); 88 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 93 { 94 Mat_SeqSELL *b; 95 PetscInt i, j, totalslices; 96 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 97 98 PetscFunctionBegin; 99 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 100 if (maxallocrow == MAT_SKIP_ALLOCATION) { 101 skipallocation = PETSC_TRUE; 102 maxallocrow = 0; 103 } 104 105 PetscCall(PetscLayoutSetUp(B->rmap)); 106 PetscCall(PetscLayoutSetUp(B->cmap)); 107 108 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 109 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 110 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 111 if (rlen) { 112 for (i = 0; i < B->rmap->n; i++) { 113 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 114 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 115 } 116 } 117 118 B->preallocated = PETSC_TRUE; 119 120 b = (Mat_SeqSELL *)B->data; 121 122 totalslices = PetscCeilInt(B->rmap->n, 8); 123 b->totalslices = totalslices; 124 if (!skipallocation) { 125 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n)); 126 127 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 128 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 129 } 130 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 131 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 132 else if (maxallocrow < 0) maxallocrow = 1; 133 for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow; 134 } else { 135 maxallocrow = 0; 136 b->sliidx[0] = 0; 137 for (i = 1; i < totalslices; i++) { 138 b->sliidx[i] = 0; 139 for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); 140 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 141 PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i])); 142 } 143 /* last slice */ 144 b->sliidx[totalslices] = 0; 145 for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 146 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 147 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices]; 148 } 149 150 /* allocate space for val, colidx, rlen */ 151 /* FIXME: should B's old memory be unlogged? */ 152 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 153 /* FIXME: assuming an element of the bit array takes 8 bits */ 154 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 155 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 156 PetscCall(PetscCalloc1(8 * totalslices, &b->rlen)); 157 158 b->singlemalloc = PETSC_TRUE; 159 b->free_val = PETSC_TRUE; 160 b->free_colidx = PETSC_TRUE; 161 } else { 162 b->free_val = PETSC_FALSE; 163 b->free_colidx = PETSC_FALSE; 164 } 165 166 b->nz = 0; 167 b->maxallocrow = maxallocrow; 168 b->rlenmax = maxallocrow; 169 b->maxallocmat = b->sliidx[totalslices]; 170 B->info.nz_unneeded = (double)b->maxallocmat; 171 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 172 PetscFunctionReturn(0); 173 } 174 175 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 176 { 177 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 178 PetscInt shift; 179 180 PetscFunctionBegin; 181 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 182 if (nz) *nz = a->rlen[row]; 183 shift = a->sliidx[row >> 3] + (row & 0x07); 184 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 185 if (idx) { 186 PetscInt j; 187 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j]; 188 *idx = a->getrowcols; 189 } 190 if (v) { 191 PetscInt j; 192 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j]; 193 *v = a->getrowvals; 194 } 195 PetscFunctionReturn(0); 196 } 197 198 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 199 { 200 PetscFunctionBegin; 201 PetscFunctionReturn(0); 202 } 203 204 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 205 { 206 Mat B; 207 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 208 PetscInt i; 209 210 PetscFunctionBegin; 211 if (reuse == MAT_REUSE_MATRIX) { 212 B = *newmat; 213 PetscCall(MatZeroEntries(B)); 214 } else { 215 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 216 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 217 PetscCall(MatSetType(B, MATSEQAIJ)); 218 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 219 } 220 221 for (i = 0; i < A->rmap->n; i++) { 222 PetscInt nz = 0, *cols = NULL; 223 PetscScalar *vals = NULL; 224 225 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 226 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 227 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 228 } 229 230 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 231 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 232 B->rmap->bs = A->rmap->bs; 233 234 if (reuse == MAT_INPLACE_MATRIX) { 235 PetscCall(MatHeaderReplace(A, &B)); 236 } else { 237 *newmat = B; 238 } 239 PetscFunctionReturn(0); 240 } 241 242 #include <../src/mat/impls/aij/seq/aij.h> 243 244 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 245 { 246 Mat B; 247 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 248 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 249 const PetscInt *cols; 250 const PetscScalar *vals; 251 252 PetscFunctionBegin; 253 254 if (reuse == MAT_REUSE_MATRIX) { 255 B = *newmat; 256 } else { 257 if (PetscDefined(USE_DEBUG) || !a->ilen) { 258 PetscCall(PetscMalloc1(m, &rowlengths)); 259 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 260 } 261 if (PetscDefined(USE_DEBUG) && a->ilen) { 262 PetscBool eq; 263 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 264 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 265 PetscCall(PetscFree(rowlengths)); 266 rowlengths = a->ilen; 267 } else if (a->ilen) rowlengths = a->ilen; 268 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 269 PetscCall(MatSetSizes(B, m, n, m, n)); 270 PetscCall(MatSetType(B, MATSEQSELL)); 271 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 272 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 273 } 274 275 for (row = 0; row < m; row++) { 276 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 277 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 278 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 279 } 280 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 281 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 282 B->rmap->bs = A->rmap->bs; 283 284 if (reuse == MAT_INPLACE_MATRIX) { 285 PetscCall(MatHeaderReplace(A, &B)); 286 } else { 287 *newmat = B; 288 } 289 PetscFunctionReturn(0); 290 } 291 292 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 293 { 294 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 295 PetscScalar *y; 296 const PetscScalar *x; 297 const MatScalar *aval = a->val; 298 PetscInt totalslices = a->totalslices; 299 const PetscInt *acolidx = a->colidx; 300 PetscInt i, j; 301 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 302 __m512d vec_x, vec_y, vec_vals; 303 __m256i vec_idx; 304 __mmask8 mask; 305 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 306 __m256i vec_idx2, vec_idx3, vec_idx4; 307 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 308 __m128i vec_idx; 309 __m256d vec_x, vec_y, vec_y2, vec_vals; 310 MatScalar yval; 311 PetscInt r, rows_left, row, nnz_in_row; 312 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 313 __m128d vec_x_tmp; 314 __m256d vec_x, vec_y, vec_y2, vec_vals; 315 MatScalar yval; 316 PetscInt r, rows_left, row, nnz_in_row; 317 #else 318 PetscScalar sum[8]; 319 #endif 320 321 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 322 #pragma disjoint(*x, *y, *aval) 323 #endif 324 325 PetscFunctionBegin; 326 PetscCall(VecGetArrayRead(xx, &x)); 327 PetscCall(VecGetArray(yy, &y)); 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 for (i = 0; i < totalslices; i++) { /* loop over slices */ 330 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 331 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 332 333 vec_y = _mm512_setzero_pd(); 334 vec_y2 = _mm512_setzero_pd(); 335 vec_y3 = _mm512_setzero_pd(); 336 vec_y4 = _mm512_setzero_pd(); 337 338 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 339 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 340 case 3: 341 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 342 acolidx += 8; 343 aval += 8; 344 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 345 acolidx += 8; 346 aval += 8; 347 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 348 acolidx += 8; 349 aval += 8; 350 j += 3; 351 break; 352 case 2: 353 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 354 acolidx += 8; 355 aval += 8; 356 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 357 acolidx += 8; 358 aval += 8; 359 j += 2; 360 break; 361 case 1: 362 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 363 acolidx += 8; 364 aval += 8; 365 j += 1; 366 break; 367 } 368 #pragma novector 369 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 380 acolidx += 8; 381 aval += 8; 382 } 383 384 vec_y = _mm512_add_pd(vec_y, vec_y2); 385 vec_y = _mm512_add_pd(vec_y, vec_y3); 386 vec_y = _mm512_add_pd(vec_y, vec_y4); 387 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 388 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 389 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 390 } else { 391 _mm512_storeu_pd(&y[8 * i], vec_y); 392 } 393 } 394 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 395 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 396 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 397 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 398 399 /* last slice may have padding rows. Don't use vectorization. */ 400 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 401 rows_left = A->rmap->n - 8 * i; 402 for (r = 0; r < rows_left; ++r) { 403 yval = (MatScalar)0; 404 row = 8 * i + r; 405 nnz_in_row = a->rlen[row]; 406 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 407 y[row] = yval; 408 } 409 break; 410 } 411 412 vec_y = _mm256_setzero_pd(); 413 vec_y2 = _mm256_setzero_pd(); 414 415 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 416 #pragma novector 417 #pragma unroll(2) 418 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 419 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 420 aval += 4; 421 acolidx += 4; 422 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 423 aval += 4; 424 acolidx += 4; 425 } 426 427 _mm256_storeu_pd(y + i * 8, vec_y); 428 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 429 } 430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 431 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 432 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 433 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 434 435 vec_y = _mm256_setzero_pd(); 436 vec_y2 = _mm256_setzero_pd(); 437 438 /* last slice may have padding rows. Don't use vectorization. */ 439 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 440 rows_left = A->rmap->n - 8 * i; 441 for (r = 0; r < rows_left; ++r) { 442 yval = (MatScalar)0; 443 row = 8 * i + r; 444 nnz_in_row = a->rlen[row]; 445 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 446 y[row] = yval; 447 } 448 break; 449 } 450 451 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 452 #pragma novector 453 #pragma unroll(2) 454 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 455 vec_vals = _mm256_loadu_pd(aval); 456 vec_x_tmp = _mm_setzero_pd(); 457 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 458 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 459 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 460 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 461 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 462 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 463 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 464 aval += 4; 465 466 vec_vals = _mm256_loadu_pd(aval); 467 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 468 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 469 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 470 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 471 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 472 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 473 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 474 aval += 4; 475 } 476 477 _mm256_storeu_pd(y + i * 8, vec_y); 478 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 479 } 480 #else 481 for (i = 0; i < totalslices; i++) { /* loop over slices */ 482 for (j = 0; j < 8; j++) sum[j] = 0.0; 483 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 484 sum[0] += aval[j] * x[acolidx[j]]; 485 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 486 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 487 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 488 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 489 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 490 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 491 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 492 } 493 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 494 for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j]; 495 } else { 496 for (j = 0; j < 8; j++) y[8 * i + j] = sum[j]; 497 } 498 } 499 #endif 500 501 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 502 PetscCall(VecRestoreArrayRead(xx, &x)); 503 PetscCall(VecRestoreArray(yy, &y)); 504 PetscFunctionReturn(0); 505 } 506 507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 508 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 509 { 510 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 511 PetscScalar *y, *z; 512 const PetscScalar *x; 513 const MatScalar *aval = a->val; 514 PetscInt totalslices = a->totalslices; 515 const PetscInt *acolidx = a->colidx; 516 PetscInt i, j; 517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 518 __m512d vec_x, vec_y, vec_vals; 519 __m256i vec_idx; 520 __mmask8 mask; 521 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 522 __m256i vec_idx2, vec_idx3, vec_idx4; 523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 524 __m128d vec_x_tmp; 525 __m256d vec_x, vec_y, vec_y2, vec_vals; 526 MatScalar yval; 527 PetscInt r, row, nnz_in_row; 528 #else 529 PetscScalar sum[8]; 530 #endif 531 532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 533 #pragma disjoint(*x, *y, *aval) 534 #endif 535 536 PetscFunctionBegin; 537 PetscCall(VecGetArrayRead(xx, &x)); 538 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 540 for (i = 0; i < totalslices; i++) { /* loop over slices */ 541 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 542 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 543 544 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 545 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 546 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 547 } else { 548 vec_y = _mm512_loadu_pd(&y[8 * i]); 549 } 550 vec_y2 = _mm512_setzero_pd(); 551 vec_y3 = _mm512_setzero_pd(); 552 vec_y4 = _mm512_setzero_pd(); 553 554 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 555 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 556 case 3: 557 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 558 acolidx += 8; 559 aval += 8; 560 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 561 acolidx += 8; 562 aval += 8; 563 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 564 acolidx += 8; 565 aval += 8; 566 j += 3; 567 break; 568 case 2: 569 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 570 acolidx += 8; 571 aval += 8; 572 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 573 acolidx += 8; 574 aval += 8; 575 j += 2; 576 break; 577 case 1: 578 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 579 acolidx += 8; 580 aval += 8; 581 j += 1; 582 break; 583 } 584 #pragma novector 585 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 586 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 587 acolidx += 8; 588 aval += 8; 589 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 596 acolidx += 8; 597 aval += 8; 598 } 599 600 vec_y = _mm512_add_pd(vec_y, vec_y2); 601 vec_y = _mm512_add_pd(vec_y, vec_y3); 602 vec_y = _mm512_add_pd(vec_y, vec_y4); 603 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 604 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 605 } else { 606 _mm512_storeu_pd(&z[8 * i], vec_y); 607 } 608 } 609 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 610 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 611 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 612 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 613 614 /* last slice may have padding rows. Don't use vectorization. */ 615 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 616 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 617 row = 8 * i + r; 618 yval = (MatScalar)0.0; 619 nnz_in_row = a->rlen[row]; 620 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 621 z[row] = y[row] + yval; 622 } 623 break; 624 } 625 626 vec_y = _mm256_loadu_pd(y + 8 * i); 627 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 628 629 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 630 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 631 vec_vals = _mm256_loadu_pd(aval); 632 vec_x_tmp = _mm_setzero_pd(); 633 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 634 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 635 vec_x = _mm256_setzero_pd(); 636 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 637 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 638 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 639 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 640 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 641 aval += 4; 642 643 vec_vals = _mm256_loadu_pd(aval); 644 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 645 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 646 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 647 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 648 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 649 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 650 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 651 aval += 4; 652 } 653 654 _mm256_storeu_pd(z + i * 8, vec_y); 655 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 656 } 657 #else 658 for (i = 0; i < totalslices; i++) { /* loop over slices */ 659 for (j = 0; j < 8; j++) sum[j] = 0.0; 660 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 661 sum[0] += aval[j] * x[acolidx[j]]; 662 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 663 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 664 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 665 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 666 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 667 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 668 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 669 } 670 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 671 for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 672 } else { 673 for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 674 } 675 } 676 #endif 677 678 PetscCall(PetscLogFlops(2.0 * a->nz)); 679 PetscCall(VecRestoreArrayRead(xx, &x)); 680 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 681 PetscFunctionReturn(0); 682 } 683 684 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 685 { 686 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 687 PetscScalar *y; 688 const PetscScalar *x; 689 const MatScalar *aval = a->val; 690 const PetscInt *acolidx = a->colidx; 691 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices; 692 693 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 694 #pragma disjoint(*x, *y, *aval) 695 #endif 696 697 PetscFunctionBegin; 698 if (A->symmetric == PETSC_BOOL3_TRUE) { 699 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 700 PetscFunctionReturn(0); 701 } 702 if (zz != yy) PetscCall(VecCopy(zz, yy)); 703 PetscCall(VecGetArrayRead(xx, &x)); 704 PetscCall(VecGetArray(yy, &y)); 705 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 706 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 707 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 708 row = 8 * i + r; 709 nnz_in_row = a->rlen[row]; 710 for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row]; 711 } 712 break; 713 } 714 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 715 y[acolidx[j]] += aval[j] * x[8 * i]; 716 y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1]; 717 y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2]; 718 y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3]; 719 y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4]; 720 y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5]; 721 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 722 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 723 } 724 } 725 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 726 PetscCall(VecRestoreArrayRead(xx, &x)); 727 PetscCall(VecRestoreArray(yy, &y)); 728 PetscFunctionReturn(0); 729 } 730 731 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 732 { 733 PetscFunctionBegin; 734 if (A->symmetric == PETSC_BOOL3_TRUE) { 735 PetscCall(MatMult_SeqSELL(A, xx, yy)); 736 } else { 737 PetscCall(VecSet(yy, 0.0)); 738 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 739 } 740 PetscFunctionReturn(0); 741 } 742 743 /* 744 Checks for missing diagonals 745 */ 746 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 747 { 748 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 749 PetscInt *diag, i; 750 751 PetscFunctionBegin; 752 *missing = PETSC_FALSE; 753 if (A->rmap->n > 0 && !(a->colidx)) { 754 *missing = PETSC_TRUE; 755 if (d) *d = 0; 756 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 757 } else { 758 diag = a->diag; 759 for (i = 0; i < A->rmap->n; i++) { 760 if (diag[i] == -1) { 761 *missing = PETSC_TRUE; 762 if (d) *d = i; 763 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 764 break; 765 } 766 } 767 } 768 PetscFunctionReturn(0); 769 } 770 771 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 772 { 773 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 774 PetscInt i, j, m = A->rmap->n, shift; 775 776 PetscFunctionBegin; 777 if (!a->diag) { 778 PetscCall(PetscMalloc1(m, &a->diag)); 779 a->free_diag = PETSC_TRUE; 780 } 781 for (i = 0; i < m; i++) { /* loop over rows */ 782 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 783 a->diag[i] = -1; 784 for (j = 0; j < a->rlen[i]; j++) { 785 if (a->colidx[shift + j * 8] == i) { 786 a->diag[i] = shift + j * 8; 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(0); 792 } 793 794 /* 795 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 796 */ 797 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 798 { 799 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 800 PetscInt i, *diag, m = A->rmap->n; 801 MatScalar *val = a->val; 802 PetscScalar *idiag, *mdiag; 803 804 PetscFunctionBegin; 805 if (a->idiagvalid) PetscFunctionReturn(0); 806 PetscCall(MatMarkDiagonal_SeqSELL(A)); 807 diag = a->diag; 808 if (!a->idiag) { 809 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 810 val = a->val; 811 } 812 mdiag = a->mdiag; 813 idiag = a->idiag; 814 815 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 816 for (i = 0; i < m; i++) { 817 mdiag[i] = val[diag[i]]; 818 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 819 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 820 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 821 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 822 A->factorerror_zeropivot_value = 0.0; 823 A->factorerror_zeropivot_row = i; 824 } 825 idiag[i] = 1.0 / val[diag[i]]; 826 } 827 PetscCall(PetscLogFlops(m)); 828 } else { 829 for (i = 0; i < m; i++) { 830 mdiag[i] = val[diag[i]]; 831 idiag[i] = omega / (fshift + val[diag[i]]); 832 } 833 PetscCall(PetscLogFlops(2.0 * m)); 834 } 835 a->idiagvalid = PETSC_TRUE; 836 PetscFunctionReturn(0); 837 } 838 839 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 840 { 841 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 842 843 PetscFunctionBegin; 844 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 845 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 846 PetscFunctionReturn(0); 847 } 848 849 PetscErrorCode MatDestroy_SeqSELL(Mat A) 850 { 851 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 852 853 PetscFunctionBegin; 854 #if defined(PETSC_USE_LOG) 855 PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz); 856 #endif 857 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 858 PetscCall(ISDestroy(&a->row)); 859 PetscCall(ISDestroy(&a->col)); 860 PetscCall(PetscFree(a->diag)); 861 PetscCall(PetscFree(a->rlen)); 862 PetscCall(PetscFree(a->sliidx)); 863 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 864 PetscCall(PetscFree(a->solve_work)); 865 PetscCall(ISDestroy(&a->icol)); 866 PetscCall(PetscFree(a->saved_values)); 867 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 868 869 PetscCall(PetscFree(A->data)); 870 871 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 872 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 873 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 874 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 875 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 876 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 877 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 878 PetscFunctionReturn(0); 879 } 880 881 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 882 { 883 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 884 885 PetscFunctionBegin; 886 switch (op) { 887 case MAT_ROW_ORIENTED: 888 a->roworiented = flg; 889 break; 890 case MAT_KEEP_NONZERO_PATTERN: 891 a->keepnonzeropattern = flg; 892 break; 893 case MAT_NEW_NONZERO_LOCATIONS: 894 a->nonew = (flg ? 0 : 1); 895 break; 896 case MAT_NEW_NONZERO_LOCATION_ERR: 897 a->nonew = (flg ? -1 : 0); 898 break; 899 case MAT_NEW_NONZERO_ALLOCATION_ERR: 900 a->nonew = (flg ? -2 : 0); 901 break; 902 case MAT_UNUSED_NONZERO_LOCATION_ERR: 903 a->nounused = (flg ? -1 : 0); 904 break; 905 case MAT_FORCE_DIAGONAL_ENTRIES: 906 case MAT_IGNORE_OFF_PROC_ENTRIES: 907 case MAT_USE_HASH_TABLE: 908 case MAT_SORTED_FULL: 909 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 910 break; 911 case MAT_SPD: 912 case MAT_SYMMETRIC: 913 case MAT_STRUCTURALLY_SYMMETRIC: 914 case MAT_HERMITIAN: 915 case MAT_SYMMETRY_ETERNAL: 916 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 917 case MAT_SPD_ETERNAL: 918 /* These options are handled directly by MatSetOption() */ 919 break; 920 default: 921 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 922 } 923 PetscFunctionReturn(0); 924 } 925 926 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 927 { 928 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 929 PetscInt i, j, n, shift; 930 PetscScalar *x, zero = 0.0; 931 932 PetscFunctionBegin; 933 PetscCall(VecGetLocalSize(v, &n)); 934 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 935 936 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 937 PetscInt *diag = a->diag; 938 PetscCall(VecGetArray(v, &x)); 939 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 940 PetscCall(VecRestoreArray(v, &x)); 941 PetscFunctionReturn(0); 942 } 943 944 PetscCall(VecSet(v, zero)); 945 PetscCall(VecGetArray(v, &x)); 946 for (i = 0; i < n; i++) { /* loop over rows */ 947 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 948 x[i] = 0; 949 for (j = 0; j < a->rlen[i]; j++) { 950 if (a->colidx[shift + j * 8] == i) { 951 x[i] = a->val[shift + j * 8]; 952 break; 953 } 954 } 955 } 956 PetscCall(VecRestoreArray(v, &x)); 957 PetscFunctionReturn(0); 958 } 959 960 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 961 { 962 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 963 const PetscScalar *l, *r; 964 PetscInt i, j, m, n, row; 965 966 PetscFunctionBegin; 967 if (ll) { 968 /* The local size is used so that VecMPI can be passed to this routine 969 by MatDiagonalScale_MPISELL */ 970 PetscCall(VecGetLocalSize(ll, &m)); 971 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 972 PetscCall(VecGetArrayRead(ll, &l)); 973 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 974 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 975 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 976 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row]; 977 } 978 } else { 979 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row]; 980 } 981 } 982 PetscCall(VecRestoreArrayRead(ll, &l)); 983 PetscCall(PetscLogFlops(a->nz)); 984 } 985 if (rr) { 986 PetscCall(VecGetLocalSize(rr, &n)); 987 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 988 PetscCall(VecGetArrayRead(rr, &r)); 989 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 990 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 991 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 992 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 993 } 994 } else { 995 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 996 } 997 } 998 PetscCall(VecRestoreArrayRead(rr, &r)); 999 PetscCall(PetscLogFlops(a->nz)); 1000 } 1001 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1006 { 1007 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1008 PetscInt *cp, i, k, low, high, t, row, col, l; 1009 PetscInt shift; 1010 MatScalar *vp; 1011 1012 PetscFunctionBegin; 1013 for (k = 0; k < m; k++) { /* loop over requested rows */ 1014 row = im[k]; 1015 if (row < 0) continue; 1016 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1017 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1018 cp = a->colidx + shift; /* pointer to the row */ 1019 vp = a->val + shift; /* pointer to the row */ 1020 for (l = 0; l < n; l++) { /* loop over requested columns */ 1021 col = in[l]; 1022 if (col < 0) continue; 1023 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1024 high = a->rlen[row]; 1025 low = 0; /* assume unsorted */ 1026 while (high - low > 5) { 1027 t = (low + high) / 2; 1028 if (*(cp + t * 8) > col) high = t; 1029 else low = t; 1030 } 1031 for (i = low; i < high; i++) { 1032 if (*(cp + 8 * i) > col) break; 1033 if (*(cp + 8 * i) == col) { 1034 *v++ = *(vp + 8 * i); 1035 goto finished; 1036 } 1037 } 1038 *v++ = 0.0; 1039 finished:; 1040 } 1041 } 1042 PetscFunctionReturn(0); 1043 } 1044 1045 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1046 { 1047 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1048 PetscInt i, j, m = A->rmap->n, shift; 1049 const char *name; 1050 PetscViewerFormat format; 1051 1052 PetscFunctionBegin; 1053 PetscCall(PetscViewerGetFormat(viewer, &format)); 1054 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1055 PetscInt nofinalvalue = 0; 1056 /* 1057 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1058 nofinalvalue = 1; 1059 } 1060 */ 1061 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1062 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1063 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1064 #if defined(PETSC_USE_COMPLEX) 1065 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1066 #else 1067 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1068 #endif 1069 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1070 1071 for (i = 0; i < m; i++) { 1072 shift = a->sliidx[i >> 3] + (i & 0x07); 1073 for (j = 0; j < a->rlen[i]; j++) { 1074 #if defined(PETSC_USE_COMPLEX) 1075 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1076 #else 1077 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1078 #endif 1079 } 1080 } 1081 /* 1082 if (nofinalvalue) { 1083 #if defined(PETSC_USE_COMPLEX) 1084 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1085 #else 1086 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1087 #endif 1088 } 1089 */ 1090 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1091 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1092 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1093 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1094 PetscFunctionReturn(0); 1095 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1096 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1097 for (i = 0; i < m; i++) { 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1099 shift = a->sliidx[i >> 3] + (i & 0x07); 1100 for (j = 0; j < a->rlen[i]; j++) { 1101 #if defined(PETSC_USE_COMPLEX) 1102 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1103 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1104 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1105 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1106 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1107 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1108 } 1109 #else 1110 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1111 #endif 1112 } 1113 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1114 } 1115 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1116 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1117 PetscInt cnt = 0, jcnt; 1118 PetscScalar value; 1119 #if defined(PETSC_USE_COMPLEX) 1120 PetscBool realonly = PETSC_TRUE; 1121 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1122 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1123 realonly = PETSC_FALSE; 1124 break; 1125 } 1126 } 1127 #endif 1128 1129 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1130 for (i = 0; i < m; i++) { 1131 jcnt = 0; 1132 shift = a->sliidx[i >> 3] + (i & 0x07); 1133 for (j = 0; j < A->cmap->n; j++) { 1134 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1135 value = a->val[cnt++]; 1136 jcnt++; 1137 } else { 1138 value = 0.0; 1139 } 1140 #if defined(PETSC_USE_COMPLEX) 1141 if (realonly) { 1142 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1143 } else { 1144 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1145 } 1146 #else 1147 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1148 #endif 1149 } 1150 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1151 } 1152 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1153 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1154 PetscInt fshift = 1; 1155 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1156 #if defined(PETSC_USE_COMPLEX) 1157 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1158 #else 1159 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1160 #endif 1161 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1162 for (i = 0; i < m; i++) { 1163 shift = a->sliidx[i >> 3] + (i & 0x07); 1164 for (j = 0; j < a->rlen[i]; j++) { 1165 #if defined(PETSC_USE_COMPLEX) 1166 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1167 #else 1168 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1169 #endif 1170 } 1171 } 1172 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1173 } else if (format == PETSC_VIEWER_NATIVE) { 1174 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1175 PetscInt row; 1176 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1177 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1178 #if defined(PETSC_USE_COMPLEX) 1179 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1181 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1182 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1183 } else { 1184 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1185 } 1186 #else 1187 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1188 #endif 1189 } 1190 } 1191 } else { 1192 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1193 if (A->factortype) { 1194 for (i = 0; i < m; i++) { 1195 shift = a->sliidx[i >> 3] + (i & 0x07); 1196 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1197 /* L part */ 1198 for (j = shift; j < a->diag[i]; j += 8) { 1199 #if defined(PETSC_USE_COMPLEX) 1200 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1201 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1202 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1203 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1204 } else { 1205 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1206 } 1207 #else 1208 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1209 #endif 1210 } 1211 /* diagonal */ 1212 j = a->diag[i]; 1213 #if defined(PETSC_USE_COMPLEX) 1214 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1215 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1216 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1217 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1218 } else { 1219 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1220 } 1221 #else 1222 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1223 #endif 1224 1225 /* U part */ 1226 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1227 #if defined(PETSC_USE_COMPLEX) 1228 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1229 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1230 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1231 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1232 } else { 1233 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1234 } 1235 #else 1236 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1237 #endif 1238 } 1239 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1240 } 1241 } else { 1242 for (i = 0; i < m; i++) { 1243 shift = a->sliidx[i >> 3] + (i & 0x07); 1244 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1245 for (j = 0; j < a->rlen[i]; j++) { 1246 #if defined(PETSC_USE_COMPLEX) 1247 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1248 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1249 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1251 } else { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1253 } 1254 #else 1255 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1256 #endif 1257 } 1258 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1259 } 1260 } 1261 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1262 } 1263 PetscCall(PetscViewerFlush(viewer)); 1264 PetscFunctionReturn(0); 1265 } 1266 1267 #include <petscdraw.h> 1268 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1269 { 1270 Mat A = (Mat)Aa; 1271 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1272 PetscInt i, j, m = A->rmap->n, shift; 1273 int color; 1274 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1275 PetscViewer viewer; 1276 PetscViewerFormat format; 1277 1278 PetscFunctionBegin; 1279 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1280 PetscCall(PetscViewerGetFormat(viewer, &format)); 1281 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1282 1283 /* loop over matrix elements drawing boxes */ 1284 1285 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1286 PetscDrawCollectiveBegin(draw); 1287 /* Blue for negative, Cyan for zero and Red for positive */ 1288 color = PETSC_DRAW_BLUE; 1289 for (i = 0; i < m; i++) { 1290 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1291 y_l = m - i - 1.0; 1292 y_r = y_l + 1.0; 1293 for (j = 0; j < a->rlen[i]; j++) { 1294 x_l = a->colidx[shift + j * 8]; 1295 x_r = x_l + 1.0; 1296 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue; 1297 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1298 } 1299 } 1300 color = PETSC_DRAW_CYAN; 1301 for (i = 0; i < m; i++) { 1302 shift = a->sliidx[i >> 3] + (i & 0x07); 1303 y_l = m - i - 1.0; 1304 y_r = y_l + 1.0; 1305 for (j = 0; j < a->rlen[i]; j++) { 1306 x_l = a->colidx[shift + j * 8]; 1307 x_r = x_l + 1.0; 1308 if (a->val[shift + 8 * j] != 0.) continue; 1309 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1310 } 1311 } 1312 color = PETSC_DRAW_RED; 1313 for (i = 0; i < m; i++) { 1314 shift = a->sliidx[i >> 3] + (i & 0x07); 1315 y_l = m - i - 1.0; 1316 y_r = y_l + 1.0; 1317 for (j = 0; j < a->rlen[i]; j++) { 1318 x_l = a->colidx[shift + j * 8]; 1319 x_r = x_l + 1.0; 1320 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1321 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1322 } 1323 } 1324 PetscDrawCollectiveEnd(draw); 1325 } else { 1326 /* use contour shading to indicate magnitude of values */ 1327 /* first determine max of all nonzero values */ 1328 PetscReal minv = 0.0, maxv = 0.0; 1329 PetscInt count = 0; 1330 PetscDraw popup; 1331 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1332 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1333 } 1334 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1335 PetscCall(PetscDrawGetPopup(draw, &popup)); 1336 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1337 1338 PetscDrawCollectiveBegin(draw); 1339 for (i = 0; i < m; i++) { 1340 shift = a->sliidx[i >> 3] + (i & 0x07); 1341 y_l = m - i - 1.0; 1342 y_r = y_l + 1.0; 1343 for (j = 0; j < a->rlen[i]; j++) { 1344 x_l = a->colidx[shift + j * 8]; 1345 x_r = x_l + 1.0; 1346 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1347 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1348 count++; 1349 } 1350 } 1351 PetscDrawCollectiveEnd(draw); 1352 } 1353 PetscFunctionReturn(0); 1354 } 1355 1356 #include <petscdraw.h> 1357 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1358 { 1359 PetscDraw draw; 1360 PetscReal xr, yr, xl, yl, h, w; 1361 PetscBool isnull; 1362 1363 PetscFunctionBegin; 1364 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1365 PetscCall(PetscDrawIsNull(draw, &isnull)); 1366 if (isnull) PetscFunctionReturn(0); 1367 1368 xr = A->cmap->n; 1369 yr = A->rmap->n; 1370 h = yr / 10.0; 1371 w = xr / 10.0; 1372 xr += w; 1373 yr += h; 1374 xl = -w; 1375 yl = -h; 1376 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1377 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1378 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1379 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1380 PetscCall(PetscDrawSave(draw)); 1381 PetscFunctionReturn(0); 1382 } 1383 1384 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1385 { 1386 PetscBool iascii, isbinary, isdraw; 1387 1388 PetscFunctionBegin; 1389 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1390 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1391 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1392 if (iascii) { 1393 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1394 } else if (isbinary) { 1395 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1396 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1397 PetscFunctionReturn(0); 1398 } 1399 1400 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1401 { 1402 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1403 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1404 MatScalar *vp; 1405 1406 PetscFunctionBegin; 1407 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1408 /* To do: compress out the unused elements */ 1409 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1410 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1411 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1412 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1413 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1414 for (i = 0; i < a->totalslices; ++i) { 1415 shift = a->sliidx[i]; /* starting index of the slice */ 1416 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1417 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1418 for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */ 1419 row = 8 * i + row_in_slice; 1420 nrow = a->rlen[row]; /* number of nonzeros in row */ 1421 /* 1422 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1423 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1424 */ 1425 lastcol = 0; 1426 if (nrow > 0) { /* nonempty row */ 1427 lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1428 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1429 for (j = 1; j < 8; j++) { 1430 if (a->rlen[8 * i + j]) { 1431 lastcol = cp[j]; 1432 break; 1433 } 1434 } 1435 } else { 1436 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1437 } 1438 1439 for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) { 1440 cp[8 * k + row_in_slice] = lastcol; 1441 vp[8 * k + row_in_slice] = (MatScalar)0; 1442 } 1443 } 1444 } 1445 1446 A->info.mallocs += a->reallocs; 1447 a->reallocs = 0; 1448 1449 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1450 PetscFunctionReturn(0); 1451 } 1452 1453 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1454 { 1455 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1456 1457 PetscFunctionBegin; 1458 info->block_size = 1.0; 1459 info->nz_allocated = a->maxallocmat; 1460 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1461 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1462 info->assemblies = A->num_ass; 1463 info->mallocs = A->info.mallocs; 1464 info->memory = 0; /* REVIEW ME */ 1465 if (A->factortype) { 1466 info->fill_ratio_given = A->info.fill_ratio_given; 1467 info->fill_ratio_needed = A->info.fill_ratio_needed; 1468 info->factor_mallocs = A->info.factor_mallocs; 1469 } else { 1470 info->fill_ratio_given = 0; 1471 info->fill_ratio_needed = 0; 1472 info->factor_mallocs = 0; 1473 } 1474 PetscFunctionReturn(0); 1475 } 1476 1477 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1478 { 1479 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1480 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1481 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1482 MatScalar *vp, value; 1483 1484 PetscFunctionBegin; 1485 for (k = 0; k < m; k++) { /* loop over added rows */ 1486 row = im[k]; 1487 if (row < 0) continue; 1488 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1489 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1490 cp = a->colidx + shift; /* pointer to the row */ 1491 vp = a->val + shift; /* pointer to the row */ 1492 nrow = a->rlen[row]; 1493 low = 0; 1494 high = nrow; 1495 1496 for (l = 0; l < n; l++) { /* loop over added columns */ 1497 col = in[l]; 1498 if (col < 0) continue; 1499 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1500 if (a->roworiented) { 1501 value = v[l + k * n]; 1502 } else { 1503 value = v[k + l * m]; 1504 } 1505 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1506 1507 /* search in this row for the specified column, i indicates the column to be set */ 1508 if (col <= lastcol) low = 0; 1509 else high = nrow; 1510 lastcol = col; 1511 while (high - low > 5) { 1512 t = (low + high) / 2; 1513 if (*(cp + t * 8) > col) high = t; 1514 else low = t; 1515 } 1516 for (i = low; i < high; i++) { 1517 if (*(cp + i * 8) > col) break; 1518 if (*(cp + i * 8) == col) { 1519 if (is == ADD_VALUES) *(vp + i * 8) += value; 1520 else *(vp + i * 8) = value; 1521 low = i + 1; 1522 goto noinsert; 1523 } 1524 } 1525 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1526 if (nonew == 1) goto noinsert; 1527 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1528 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1529 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar); 1530 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1531 for (ii = nrow - 1; ii >= i; ii--) { 1532 *(cp + (ii + 1) * 8) = *(cp + ii * 8); 1533 *(vp + (ii + 1) * 8) = *(vp + ii * 8); 1534 } 1535 a->rlen[row]++; 1536 *(cp + i * 8) = col; 1537 *(vp + i * 8) = value; 1538 a->nz++; 1539 A->nonzerostate++; 1540 low = i + 1; 1541 high++; 1542 nrow++; 1543 noinsert:; 1544 } 1545 a->rlen[row] = nrow; 1546 } 1547 PetscFunctionReturn(0); 1548 } 1549 1550 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1551 { 1552 PetscFunctionBegin; 1553 /* If the two matrices have the same copy implementation, use fast copy. */ 1554 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1555 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1556 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1557 1558 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1559 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1560 } else { 1561 PetscCall(MatCopy_Basic(A, B, str)); 1562 } 1563 PetscFunctionReturn(0); 1564 } 1565 1566 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1567 { 1568 PetscFunctionBegin; 1569 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1574 { 1575 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1576 1577 PetscFunctionBegin; 1578 *array = a->val; 1579 PetscFunctionReturn(0); 1580 } 1581 1582 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1583 { 1584 PetscFunctionBegin; 1585 PetscFunctionReturn(0); 1586 } 1587 1588 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1589 { 1590 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1591 PetscInt i; 1592 MatScalar *aval = a->val; 1593 1594 PetscFunctionBegin; 1595 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1600 { 1601 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1602 PetscInt i; 1603 MatScalar *aval = a->val; 1604 1605 PetscFunctionBegin; 1606 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1607 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1608 PetscFunctionReturn(0); 1609 } 1610 1611 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1612 { 1613 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1614 MatScalar *aval = a->val; 1615 PetscScalar oalpha = alpha; 1616 PetscBLASInt one = 1, size; 1617 1618 PetscFunctionBegin; 1619 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1620 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1621 PetscCall(PetscLogFlops(a->nz)); 1622 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1627 { 1628 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1629 1630 PetscFunctionBegin; 1631 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1632 PetscCall(MatShift_Basic(Y, a)); 1633 PetscFunctionReturn(0); 1634 } 1635 1636 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1637 { 1638 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1639 PetscScalar *x, sum, *t; 1640 const MatScalar *idiag = NULL, *mdiag; 1641 const PetscScalar *b, *xb; 1642 PetscInt n, m = A->rmap->n, i, j, shift; 1643 const PetscInt *diag; 1644 1645 PetscFunctionBegin; 1646 its = its * lits; 1647 1648 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1649 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1650 a->fshift = fshift; 1651 a->omega = omega; 1652 1653 diag = a->diag; 1654 t = a->ssor_work; 1655 idiag = a->idiag; 1656 mdiag = a->mdiag; 1657 1658 PetscCall(VecGetArray(xx, &x)); 1659 PetscCall(VecGetArrayRead(bb, &b)); 1660 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1661 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1662 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1663 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1664 1665 if (flag & SOR_ZERO_INITIAL_GUESS) { 1666 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1667 for (i = 0; i < m; i++) { 1668 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1669 sum = b[i]; 1670 n = (diag[i] - shift) / 8; 1671 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1672 t[i] = sum; 1673 x[i] = sum * idiag[i]; 1674 } 1675 xb = t; 1676 PetscCall(PetscLogFlops(a->nz)); 1677 } else xb = b; 1678 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1679 for (i = m - 1; i >= 0; i--) { 1680 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1681 sum = xb[i]; 1682 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1683 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1684 if (xb == b) { 1685 x[i] = sum * idiag[i]; 1686 } else { 1687 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1688 } 1689 } 1690 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1691 } 1692 its--; 1693 } 1694 while (its--) { 1695 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1696 for (i = 0; i < m; i++) { 1697 /* lower */ 1698 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1699 sum = b[i]; 1700 n = (diag[i] - shift) / 8; 1701 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1702 t[i] = sum; /* save application of the lower-triangular part */ 1703 /* upper */ 1704 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1705 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1706 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1707 } 1708 xb = t; 1709 PetscCall(PetscLogFlops(2.0 * a->nz)); 1710 } else xb = b; 1711 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1712 for (i = m - 1; i >= 0; i--) { 1713 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1714 sum = xb[i]; 1715 if (xb == b) { 1716 /* whole matrix (no checkpointing available) */ 1717 n = a->rlen[i]; 1718 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1719 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1720 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1721 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1722 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1723 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1724 } 1725 } 1726 if (xb == b) { 1727 PetscCall(PetscLogFlops(2.0 * a->nz)); 1728 } else { 1729 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1730 } 1731 } 1732 } 1733 PetscCall(VecRestoreArray(xx, &x)); 1734 PetscCall(VecRestoreArrayRead(bb, &b)); 1735 PetscFunctionReturn(0); 1736 } 1737 1738 /* -------------------------------------------------------------------*/ 1739 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1740 MatGetRow_SeqSELL, 1741 MatRestoreRow_SeqSELL, 1742 MatMult_SeqSELL, 1743 /* 4*/ MatMultAdd_SeqSELL, 1744 MatMultTranspose_SeqSELL, 1745 MatMultTransposeAdd_SeqSELL, 1746 NULL, 1747 NULL, 1748 NULL, 1749 /* 10*/ NULL, 1750 NULL, 1751 NULL, 1752 MatSOR_SeqSELL, 1753 NULL, 1754 /* 15*/ MatGetInfo_SeqSELL, 1755 MatEqual_SeqSELL, 1756 MatGetDiagonal_SeqSELL, 1757 MatDiagonalScale_SeqSELL, 1758 NULL, 1759 /* 20*/ NULL, 1760 MatAssemblyEnd_SeqSELL, 1761 MatSetOption_SeqSELL, 1762 MatZeroEntries_SeqSELL, 1763 /* 24*/ NULL, 1764 NULL, 1765 NULL, 1766 NULL, 1767 NULL, 1768 /* 29*/ MatSetUp_SeqSELL, 1769 NULL, 1770 NULL, 1771 NULL, 1772 NULL, 1773 /* 34*/ MatDuplicate_SeqSELL, 1774 NULL, 1775 NULL, 1776 NULL, 1777 NULL, 1778 /* 39*/ NULL, 1779 NULL, 1780 NULL, 1781 MatGetValues_SeqSELL, 1782 MatCopy_SeqSELL, 1783 /* 44*/ NULL, 1784 MatScale_SeqSELL, 1785 MatShift_SeqSELL, 1786 NULL, 1787 NULL, 1788 /* 49*/ NULL, 1789 NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1794 NULL, 1795 NULL, 1796 NULL, 1797 NULL, 1798 /* 59*/ NULL, 1799 MatDestroy_SeqSELL, 1800 MatView_SeqSELL, 1801 NULL, 1802 NULL, 1803 /* 64*/ NULL, 1804 NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 /* 69*/ NULL, 1809 NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 /* 74*/ NULL, 1814 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1815 NULL, 1816 NULL, 1817 NULL, 1818 /* 79*/ NULL, 1819 NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 /* 84*/ NULL, 1824 NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 /* 89*/ NULL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 /* 94*/ NULL, 1834 NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 /* 99*/ NULL, 1839 NULL, 1840 NULL, 1841 MatConjugate_SeqSELL, 1842 NULL, 1843 /*104*/ NULL, 1844 NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 /*109*/ NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 MatMissingDiagonal_SeqSELL, 1853 /*114*/ NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 /*119*/ NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 /*124*/ NULL, 1864 NULL, 1865 NULL, 1866 NULL, 1867 NULL, 1868 /*129*/ NULL, 1869 NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 /*134*/ NULL, 1874 NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 /*139*/ NULL, 1879 NULL, 1880 NULL, 1881 MatFDColoringSetUp_SeqXAIJ, 1882 NULL, 1883 /*144*/ NULL, 1884 NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 /*150*/ NULL, 1890 NULL}; 1891 1892 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1893 { 1894 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1895 1896 PetscFunctionBegin; 1897 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1898 1899 /* allocate space for values if not already there */ 1900 if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); } 1901 1902 /* copy values over */ 1903 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1904 PetscFunctionReturn(0); 1905 } 1906 1907 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1908 { 1909 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1910 1911 PetscFunctionBegin; 1912 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1913 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1914 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1915 PetscFunctionReturn(0); 1916 } 1917 1918 /*@C 1919 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 1920 1921 Not Collective 1922 1923 Input Parameters: 1924 . mat - a `MATSEQSELL` matrix 1925 . array - pointer to the data 1926 1927 Level: intermediate 1928 1929 .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1930 @*/ 1931 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 1932 { 1933 PetscFunctionBegin; 1934 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 1935 PetscFunctionReturn(0); 1936 } 1937 1938 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1939 { 1940 Mat_SeqSELL *b; 1941 PetscMPIInt size; 1942 1943 PetscFunctionBegin; 1944 PetscCall(PetscCitationsRegister(citation, &cited)); 1945 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 1946 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 1947 1948 PetscCall(PetscNew(&b)); 1949 1950 B->data = (void *)b; 1951 1952 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 1953 1954 b->row = NULL; 1955 b->col = NULL; 1956 b->icol = NULL; 1957 b->reallocs = 0; 1958 b->ignorezeroentries = PETSC_FALSE; 1959 b->roworiented = PETSC_TRUE; 1960 b->nonew = 0; 1961 b->diag = NULL; 1962 b->solve_work = NULL; 1963 B->spptr = NULL; 1964 b->saved_values = NULL; 1965 b->idiag = NULL; 1966 b->mdiag = NULL; 1967 b->ssor_work = NULL; 1968 b->omega = 1.0; 1969 b->fshift = 0.0; 1970 b->idiagvalid = PETSC_FALSE; 1971 b->keepnonzeropattern = PETSC_FALSE; 1972 1973 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 1974 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 1975 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 1976 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 1977 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 1978 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 1979 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 1980 PetscFunctionReturn(0); 1981 } 1982 1983 /* 1984 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1985 */ 1986 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 1987 { 1988 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 1989 PetscInt i, m = A->rmap->n; 1990 PetscInt totalslices = a->totalslices; 1991 1992 PetscFunctionBegin; 1993 C->factortype = A->factortype; 1994 c->row = NULL; 1995 c->col = NULL; 1996 c->icol = NULL; 1997 c->reallocs = 0; 1998 C->assembled = PETSC_TRUE; 1999 2000 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2001 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2002 2003 PetscCall(PetscMalloc1(8 * totalslices, &c->rlen)); 2004 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2005 2006 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2007 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2008 2009 /* allocate the matrix space */ 2010 if (mallocmatspace) { 2011 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2012 2013 c->singlemalloc = PETSC_TRUE; 2014 2015 if (m > 0) { 2016 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2017 if (cpvalues == MAT_COPY_VALUES) { 2018 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2019 } else { 2020 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2021 } 2022 } 2023 } 2024 2025 c->ignorezeroentries = a->ignorezeroentries; 2026 c->roworiented = a->roworiented; 2027 c->nonew = a->nonew; 2028 if (a->diag) { 2029 PetscCall(PetscMalloc1(m, &c->diag)); 2030 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2031 } else c->diag = NULL; 2032 2033 c->solve_work = NULL; 2034 c->saved_values = NULL; 2035 c->idiag = NULL; 2036 c->ssor_work = NULL; 2037 c->keepnonzeropattern = a->keepnonzeropattern; 2038 c->free_val = PETSC_TRUE; 2039 c->free_colidx = PETSC_TRUE; 2040 2041 c->maxallocmat = a->maxallocmat; 2042 c->maxallocrow = a->maxallocrow; 2043 c->rlenmax = a->rlenmax; 2044 c->nz = a->nz; 2045 C->preallocated = PETSC_TRUE; 2046 2047 c->nonzerorowcnt = a->nonzerorowcnt; 2048 C->nonzerostate = A->nonzerostate; 2049 2050 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2051 PetscFunctionReturn(0); 2052 } 2053 2054 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2055 { 2056 PetscFunctionBegin; 2057 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2058 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2059 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2060 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2061 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 /*MC 2066 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2067 based on the sliced Ellpack format 2068 2069 Options Database Keys: 2070 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2071 2072 Level: beginner 2073 2074 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2075 M*/ 2076 2077 /*MC 2078 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2079 2080 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2081 and `MATMPISELL` otherwise. As a result, for single process communicators, 2082 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2083 for communicators controlling multiple processes. It is recommended that you call both of 2084 the above preallocation routines for simplicity. 2085 2086 Options Database Keys: 2087 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2088 2089 Level: beginner 2090 2091 Notes: 2092 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2093 2094 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2095 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2096 2097 Developer Notes: 2098 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2099 2100 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2101 .vb 2102 (2 0 3 4) 2103 Consider the matrix A = (5 0 6 0) 2104 (0 0 7 8) 2105 (0 0 9 9) 2106 2107 symbolically the Ellpack format can be written as 2108 2109 (2 3 4 |) (0 2 3 |) 2110 v = (5 6 0 |) colidx = (0 2 2 |) 2111 -------- --------- 2112 (7 8 |) (2 3 |) 2113 (9 9 |) (2 3 |) 2114 2115 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2116 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2117 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2118 2119 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2120 2121 .ve 2122 2123 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2124 2125 References: 2126 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2127 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2128 2129 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2130 M*/ 2131 2132 /*@C 2133 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2134 2135 Collective on comm 2136 2137 Input Parameters: 2138 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2139 . m - number of rows 2140 . n - number of columns 2141 . rlenmax - maximum number of nonzeros in a row 2142 - rlen - array containing the number of nonzeros in the various rows 2143 (possibly different for each row) or NULL 2144 2145 Output Parameter: 2146 . A - the matrix 2147 2148 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2149 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2150 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2151 2152 Notes: 2153 If nnz is given then nz is ignored 2154 2155 Specify the preallocated storage with either rlenmax or rlen (not both). 2156 Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory 2157 allocation. For large problems you MUST preallocate memory or you 2158 will get TERRIBLE performance, see the users' manual chapter on matrices. 2159 2160 Level: intermediate 2161 2162 .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2163 @*/ 2164 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) 2165 { 2166 PetscFunctionBegin; 2167 PetscCall(MatCreate(comm, A)); 2168 PetscCall(MatSetSizes(*A, m, n, m, n)); 2169 PetscCall(MatSetType(*A, MATSEQSELL)); 2170 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen)); 2171 PetscFunctionReturn(0); 2172 } 2173 2174 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2175 { 2176 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2177 PetscInt totalslices = a->totalslices; 2178 2179 PetscFunctionBegin; 2180 /* If the matrix dimensions are not equal,or no of nonzeros */ 2181 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2182 *flg = PETSC_FALSE; 2183 PetscFunctionReturn(0); 2184 } 2185 /* if the a->colidx are the same */ 2186 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2187 if (!*flg) PetscFunctionReturn(0); 2188 /* if a->val are the same */ 2189 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2190 PetscFunctionReturn(0); 2191 } 2192 2193 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2194 { 2195 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2196 2197 PetscFunctionBegin; 2198 a->idiagvalid = PETSC_FALSE; 2199 PetscFunctionReturn(0); 2200 } 2201 2202 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2203 { 2204 #if defined(PETSC_USE_COMPLEX) 2205 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2206 PetscInt i; 2207 PetscScalar *val = a->val; 2208 2209 PetscFunctionBegin; 2210 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2211 #else 2212 PetscFunctionBegin; 2213 #endif 2214 PetscFunctionReturn(0); 2215 } 2216