1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@C 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUDA) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUDA) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUDA) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUDA) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 153 #if defined(PETSC_HAVE_CUDA) 154 rlenmax = PetscMax(b->sliidx[i], rlenmax); 155 /* Pad the slice to DEVICE_MEM_ALIGN */ 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 #endif 158 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 159 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 160 } 161 /* last slice */ 162 b->sliidx[totalslices] = 0; 163 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 164 #if defined(PETSC_HAVE_CUDA) 165 rlenmax = PetscMax(b->sliidx[i], rlenmax); 166 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 167 #endif 168 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 169 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 170 } 171 172 /* allocate space for val, colidx, rlen */ 173 /* FIXME: should B's old memory be unlogged? */ 174 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 175 /* FIXME: assuming an element of the bit array takes 8 bits */ 176 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 177 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 178 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 179 180 b->singlemalloc = PETSC_TRUE; 181 b->free_val = PETSC_TRUE; 182 b->free_colidx = PETSC_TRUE; 183 } else { 184 b->free_val = PETSC_FALSE; 185 b->free_colidx = PETSC_FALSE; 186 } 187 188 b->nz = 0; 189 b->maxallocrow = maxallocrow; 190 #if defined(PETSC_HAVE_CUDA) 191 b->rlenmax = rlenmax; 192 #else 193 b->rlenmax = maxallocrow; 194 #endif 195 b->maxallocmat = b->sliidx[totalslices]; 196 B->info.nz_unneeded = (double)b->maxallocmat; 197 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 202 { 203 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 204 PetscInt shift; 205 206 PetscFunctionBegin; 207 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 208 if (nz) *nz = a->rlen[row]; 209 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 210 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 211 if (idx) { 212 PetscInt j; 213 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 214 *idx = a->getrowcols; 215 } 216 if (v) { 217 PetscInt j; 218 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 219 *v = a->getrowvals; 220 } 221 PetscFunctionReturn(PETSC_SUCCESS); 222 } 223 224 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 225 { 226 PetscFunctionBegin; 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 231 { 232 Mat B; 233 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 234 PetscInt i; 235 236 PetscFunctionBegin; 237 if (reuse == MAT_REUSE_MATRIX) { 238 B = *newmat; 239 PetscCall(MatZeroEntries(B)); 240 } else { 241 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 242 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 243 PetscCall(MatSetType(B, MATSEQAIJ)); 244 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 245 } 246 247 for (i = 0; i < A->rmap->n; i++) { 248 PetscInt nz = 0, *cols = NULL; 249 PetscScalar *vals = NULL; 250 251 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 252 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 253 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 254 } 255 256 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 257 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 258 B->rmap->bs = A->rmap->bs; 259 260 if (reuse == MAT_INPLACE_MATRIX) { 261 PetscCall(MatHeaderReplace(A, &B)); 262 } else { 263 *newmat = B; 264 } 265 PetscFunctionReturn(PETSC_SUCCESS); 266 } 267 268 #include <../src/mat/impls/aij/seq/aij.h> 269 270 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 271 { 272 Mat B; 273 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 274 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 275 const PetscInt *cols; 276 const PetscScalar *vals; 277 278 PetscFunctionBegin; 279 if (reuse == MAT_REUSE_MATRIX) { 280 B = *newmat; 281 } else { 282 if (PetscDefined(USE_DEBUG) || !a->ilen) { 283 PetscCall(PetscMalloc1(m, &rowlengths)); 284 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 285 } 286 if (PetscDefined(USE_DEBUG) && a->ilen) { 287 PetscBool eq; 288 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 289 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 290 PetscCall(PetscFree(rowlengths)); 291 rowlengths = a->ilen; 292 } else if (a->ilen) rowlengths = a->ilen; 293 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 294 PetscCall(MatSetSizes(B, m, n, m, n)); 295 PetscCall(MatSetType(B, MATSEQSELL)); 296 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 297 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 298 } 299 300 for (row = 0; row < m; row++) { 301 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 302 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 303 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 304 } 305 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 306 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 307 B->rmap->bs = A->rmap->bs; 308 309 if (reuse == MAT_INPLACE_MATRIX) { 310 PetscCall(MatHeaderReplace(A, &B)); 311 } else { 312 *newmat = B; 313 } 314 PetscFunctionReturn(PETSC_SUCCESS); 315 } 316 317 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 318 { 319 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 320 PetscScalar *y; 321 const PetscScalar *x; 322 const MatScalar *aval = a->val; 323 PetscInt totalslices = a->totalslices; 324 const PetscInt *acolidx = a->colidx; 325 PetscInt i, j; 326 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 327 __m512d vec_x, vec_y, vec_vals; 328 __m256i vec_idx; 329 __mmask8 mask; 330 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 331 __m256i vec_idx2, vec_idx3, vec_idx4; 332 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 333 __m128i vec_idx; 334 __m256d vec_x, vec_y, vec_y2, vec_vals; 335 MatScalar yval; 336 PetscInt r, rows_left, row, nnz_in_row; 337 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 338 __m128d vec_x_tmp; 339 __m256d vec_x, vec_y, vec_y2, vec_vals; 340 MatScalar yval; 341 PetscInt r, rows_left, row, nnz_in_row; 342 #else 343 PetscInt k, sliceheight = a->sliceheight; 344 PetscScalar *sum; 345 #endif 346 347 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 348 #pragma disjoint(*x, *y, *aval) 349 #endif 350 351 PetscFunctionBegin; 352 PetscCall(VecGetArrayRead(xx, &x)); 353 PetscCall(VecGetArray(yy, &y)); 354 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 355 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 356 for (i = 0; i < totalslices; i++) { /* loop over slices */ 357 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 358 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 359 360 vec_y = _mm512_setzero_pd(); 361 vec_y2 = _mm512_setzero_pd(); 362 vec_y3 = _mm512_setzero_pd(); 363 vec_y4 = _mm512_setzero_pd(); 364 365 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 366 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 367 case 3: 368 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 369 acolidx += 8; 370 aval += 8; 371 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 375 acolidx += 8; 376 aval += 8; 377 j += 3; 378 break; 379 case 2: 380 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 381 acolidx += 8; 382 aval += 8; 383 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 384 acolidx += 8; 385 aval += 8; 386 j += 2; 387 break; 388 case 1: 389 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 390 acolidx += 8; 391 aval += 8; 392 j += 1; 393 break; 394 } 395 #pragma novector 396 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 397 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 398 acolidx += 8; 399 aval += 8; 400 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 401 acolidx += 8; 402 aval += 8; 403 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 404 acolidx += 8; 405 aval += 8; 406 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 407 acolidx += 8; 408 aval += 8; 409 } 410 411 vec_y = _mm512_add_pd(vec_y, vec_y2); 412 vec_y = _mm512_add_pd(vec_y, vec_y3); 413 vec_y = _mm512_add_pd(vec_y, vec_y4); 414 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 415 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 416 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 417 } else { 418 _mm512_storeu_pd(&y[8 * i], vec_y); 419 } 420 } 421 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 422 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 423 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 424 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 425 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 426 427 /* last slice may have padding rows. Don't use vectorization. */ 428 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 429 rows_left = A->rmap->n - 8 * i; 430 for (r = 0; r < rows_left; ++r) { 431 yval = (MatScalar)0; 432 row = 8 * i + r; 433 nnz_in_row = a->rlen[row]; 434 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 435 y[row] = yval; 436 } 437 break; 438 } 439 440 vec_y = _mm256_setzero_pd(); 441 vec_y2 = _mm256_setzero_pd(); 442 443 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 444 #pragma novector 445 #pragma unroll(2) 446 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 447 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 448 aval += 4; 449 acolidx += 4; 450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 451 aval += 4; 452 acolidx += 4; 453 } 454 455 _mm256_storeu_pd(y + i * 8, vec_y); 456 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 457 } 458 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 459 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 460 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 461 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 462 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 463 464 vec_y = _mm256_setzero_pd(); 465 vec_y2 = _mm256_setzero_pd(); 466 467 /* last slice may have padding rows. Don't use vectorization. */ 468 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 469 rows_left = A->rmap->n - 8 * i; 470 for (r = 0; r < rows_left; ++r) { 471 yval = (MatScalar)0; 472 row = 8 * i + r; 473 nnz_in_row = a->rlen[row]; 474 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 475 y[row] = yval; 476 } 477 break; 478 } 479 480 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 481 #pragma novector 482 #pragma unroll(2) 483 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 484 vec_vals = _mm256_loadu_pd(aval); 485 vec_x_tmp = _mm_setzero_pd(); 486 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 487 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 488 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 492 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 493 aval += 4; 494 495 vec_vals = _mm256_loadu_pd(aval); 496 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 497 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 498 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 502 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 503 aval += 4; 504 } 505 506 _mm256_storeu_pd(y + i * 8, vec_y); 507 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 508 } 509 #else 510 PetscCall(PetscMalloc1(sliceheight, &sum)); 511 for (i = 0; i < totalslices; i++) { /* loop over slices */ 512 for (j = 0; j < sliceheight; j++) { 513 sum[j] = 0.0; 514 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 515 } 516 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 517 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 518 } else { 519 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 520 } 521 } 522 PetscCall(PetscFree(sum)); 523 #endif 524 525 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 526 PetscCall(VecRestoreArrayRead(xx, &x)); 527 PetscCall(VecRestoreArray(yy, &y)); 528 PetscFunctionReturn(PETSC_SUCCESS); 529 } 530 531 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 532 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 533 { 534 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 535 PetscScalar *y, *z; 536 const PetscScalar *x; 537 const MatScalar *aval = a->val; 538 PetscInt totalslices = a->totalslices; 539 const PetscInt *acolidx = a->colidx; 540 PetscInt i, j; 541 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 542 __m512d vec_x, vec_y, vec_vals; 543 __m256i vec_idx; 544 __mmask8 mask = 0; 545 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 546 __m256i vec_idx2, vec_idx3, vec_idx4; 547 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 548 __m128d vec_x_tmp; 549 __m256d vec_x, vec_y, vec_y2, vec_vals; 550 MatScalar yval; 551 PetscInt r, row, nnz_in_row; 552 #else 553 PetscInt k, sliceheight = a->sliceheight; 554 PetscScalar *sum; 555 #endif 556 557 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 558 #pragma disjoint(*x, *y, *aval) 559 #endif 560 561 PetscFunctionBegin; 562 if (!a->nz) { 563 PetscCall(VecCopy(yy, zz)); 564 PetscFunctionReturn(PETSC_SUCCESS); 565 } 566 PetscCall(VecGetArrayRead(xx, &x)); 567 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 568 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 569 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 570 for (i = 0; i < totalslices; i++) { /* loop over slices */ 571 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 572 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 573 574 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 575 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 576 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 577 } else { 578 vec_y = _mm512_loadu_pd(&y[8 * i]); 579 } 580 vec_y2 = _mm512_setzero_pd(); 581 vec_y3 = _mm512_setzero_pd(); 582 vec_y4 = _mm512_setzero_pd(); 583 584 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 585 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 586 case 3: 587 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 588 acolidx += 8; 589 aval += 8; 590 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 594 acolidx += 8; 595 aval += 8; 596 j += 3; 597 break; 598 case 2: 599 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 600 acolidx += 8; 601 aval += 8; 602 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 603 acolidx += 8; 604 aval += 8; 605 j += 2; 606 break; 607 case 1: 608 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 609 acolidx += 8; 610 aval += 8; 611 j += 1; 612 break; 613 } 614 #pragma novector 615 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 616 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 617 acolidx += 8; 618 aval += 8; 619 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 620 acolidx += 8; 621 aval += 8; 622 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 623 acolidx += 8; 624 aval += 8; 625 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 626 acolidx += 8; 627 aval += 8; 628 } 629 630 vec_y = _mm512_add_pd(vec_y, vec_y2); 631 vec_y = _mm512_add_pd(vec_y, vec_y3); 632 vec_y = _mm512_add_pd(vec_y, vec_y4); 633 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 634 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 635 } else { 636 _mm512_storeu_pd(&z[8 * i], vec_y); 637 } 638 } 639 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 640 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 641 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 642 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 643 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 644 645 /* last slice may have padding rows. Don't use vectorization. */ 646 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 647 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 648 row = 8 * i + r; 649 yval = (MatScalar)0.0; 650 nnz_in_row = a->rlen[row]; 651 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 652 z[row] = y[row] + yval; 653 } 654 break; 655 } 656 657 vec_y = _mm256_loadu_pd(y + 8 * i); 658 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 659 660 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 661 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 662 vec_vals = _mm256_loadu_pd(aval); 663 vec_x_tmp = _mm_setzero_pd(); 664 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 665 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 666 vec_x = _mm256_setzero_pd(); 667 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 668 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 669 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 671 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 672 aval += 4; 673 674 vec_vals = _mm256_loadu_pd(aval); 675 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 676 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 677 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 681 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 682 aval += 4; 683 } 684 685 _mm256_storeu_pd(z + i * 8, vec_y); 686 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 687 } 688 #else 689 PetscCall(PetscMalloc1(sliceheight, &sum)); 690 for (i = 0; i < totalslices; i++) { /* loop over slices */ 691 for (j = 0; j < sliceheight; j++) { 692 sum[j] = 0.0; 693 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 694 } 695 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 696 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 697 } else { 698 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 699 } 700 } 701 PetscCall(PetscFree(sum)); 702 #endif 703 704 PetscCall(PetscLogFlops(2.0 * a->nz)); 705 PetscCall(VecRestoreArrayRead(xx, &x)); 706 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 707 PetscFunctionReturn(PETSC_SUCCESS); 708 } 709 710 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 711 { 712 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 713 PetscScalar *y; 714 const PetscScalar *x; 715 const MatScalar *aval = a->val; 716 const PetscInt *acolidx = a->colidx; 717 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 718 719 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 720 #pragma disjoint(*x, *y, *aval) 721 #endif 722 723 PetscFunctionBegin; 724 if (A->symmetric == PETSC_BOOL3_TRUE) { 725 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 726 PetscFunctionReturn(PETSC_SUCCESS); 727 } 728 if (zz != yy) PetscCall(VecCopy(zz, yy)); 729 730 if (a->nz) { 731 PetscCall(VecGetArrayRead(xx, &x)); 732 PetscCall(VecGetArray(yy, &y)); 733 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 734 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 735 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 736 row = sliceheight * i + r; 737 nnz_in_row = a->rlen[row]; 738 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 739 } 740 break; 741 } 742 for (r = 0; r < sliceheight; ++r) 743 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 744 } 745 PetscCall(PetscLogFlops(2.0 * a->nz)); 746 PetscCall(VecRestoreArrayRead(xx, &x)); 747 PetscCall(VecRestoreArray(yy, &y)); 748 } 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 753 { 754 PetscFunctionBegin; 755 if (A->symmetric == PETSC_BOOL3_TRUE) { 756 PetscCall(MatMult_SeqSELL(A, xx, yy)); 757 } else { 758 PetscCall(VecSet(yy, 0.0)); 759 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 760 } 761 PetscFunctionReturn(PETSC_SUCCESS); 762 } 763 764 /* 765 Checks for missing diagonals 766 */ 767 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 768 { 769 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 770 PetscInt *diag, i; 771 772 PetscFunctionBegin; 773 *missing = PETSC_FALSE; 774 if (A->rmap->n > 0 && !a->colidx) { 775 *missing = PETSC_TRUE; 776 if (d) *d = 0; 777 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 778 } else { 779 diag = a->diag; 780 for (i = 0; i < A->rmap->n; i++) { 781 if (diag[i] == -1) { 782 *missing = PETSC_TRUE; 783 if (d) *d = i; 784 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 785 break; 786 } 787 } 788 } 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 792 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 793 { 794 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 795 PetscInt i, j, m = A->rmap->n, shift; 796 797 PetscFunctionBegin; 798 if (!a->diag) { 799 PetscCall(PetscMalloc1(m, &a->diag)); 800 a->free_diag = PETSC_TRUE; 801 } 802 for (i = 0; i < m; i++) { /* loop over rows */ 803 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 804 a->diag[i] = -1; 805 for (j = 0; j < a->rlen[i]; j++) { 806 if (a->colidx[shift + a->sliceheight * j] == i) { 807 a->diag[i] = shift + a->sliceheight * j; 808 break; 809 } 810 } 811 } 812 PetscFunctionReturn(PETSC_SUCCESS); 813 } 814 815 /* 816 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 817 */ 818 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 819 { 820 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 821 PetscInt i, *diag, m = A->rmap->n; 822 MatScalar *val = a->val; 823 PetscScalar *idiag, *mdiag; 824 825 PetscFunctionBegin; 826 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 827 PetscCall(MatMarkDiagonal_SeqSELL(A)); 828 diag = a->diag; 829 if (!a->idiag) { 830 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 831 val = a->val; 832 } 833 mdiag = a->mdiag; 834 idiag = a->idiag; 835 836 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 837 for (i = 0; i < m; i++) { 838 mdiag[i] = val[diag[i]]; 839 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 840 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 841 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 842 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 843 A->factorerror_zeropivot_value = 0.0; 844 A->factorerror_zeropivot_row = i; 845 } 846 idiag[i] = 1.0 / val[diag[i]]; 847 } 848 PetscCall(PetscLogFlops(m)); 849 } else { 850 for (i = 0; i < m; i++) { 851 mdiag[i] = val[diag[i]]; 852 idiag[i] = omega / (fshift + val[diag[i]]); 853 } 854 PetscCall(PetscLogFlops(2.0 * m)); 855 } 856 a->idiagvalid = PETSC_TRUE; 857 PetscFunctionReturn(PETSC_SUCCESS); 858 } 859 860 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 861 { 862 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 863 864 PetscFunctionBegin; 865 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 866 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 867 PetscFunctionReturn(PETSC_SUCCESS); 868 } 869 870 PetscErrorCode MatDestroy_SeqSELL(Mat A) 871 { 872 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 873 874 PetscFunctionBegin; 875 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 876 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 877 PetscCall(ISDestroy(&a->row)); 878 PetscCall(ISDestroy(&a->col)); 879 PetscCall(PetscFree(a->diag)); 880 PetscCall(PetscFree(a->rlen)); 881 PetscCall(PetscFree(a->sliidx)); 882 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 883 PetscCall(PetscFree(a->solve_work)); 884 PetscCall(ISDestroy(&a->icol)); 885 PetscCall(PetscFree(a->saved_values)); 886 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 887 PetscCall(PetscFree(A->data)); 888 #if defined(PETSC_HAVE_CUDA) 889 PetscCall(PetscFree(a->chunk_slice_map)); 890 #endif 891 892 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 893 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 894 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 895 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 899 #if defined(PETSC_HAVE_CUDA) 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 901 #endif 902 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 905 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 907 PetscFunctionReturn(PETSC_SUCCESS); 908 } 909 910 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 911 { 912 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 913 914 PetscFunctionBegin; 915 switch (op) { 916 case MAT_ROW_ORIENTED: 917 a->roworiented = flg; 918 break; 919 case MAT_KEEP_NONZERO_PATTERN: 920 a->keepnonzeropattern = flg; 921 break; 922 case MAT_NEW_NONZERO_LOCATIONS: 923 a->nonew = (flg ? 0 : 1); 924 break; 925 case MAT_NEW_NONZERO_LOCATION_ERR: 926 a->nonew = (flg ? -1 : 0); 927 break; 928 case MAT_NEW_NONZERO_ALLOCATION_ERR: 929 a->nonew = (flg ? -2 : 0); 930 break; 931 case MAT_UNUSED_NONZERO_LOCATION_ERR: 932 a->nounused = (flg ? -1 : 0); 933 break; 934 case MAT_FORCE_DIAGONAL_ENTRIES: 935 case MAT_IGNORE_OFF_PROC_ENTRIES: 936 case MAT_USE_HASH_TABLE: 937 case MAT_SORTED_FULL: 938 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 939 break; 940 case MAT_SPD: 941 case MAT_SYMMETRIC: 942 case MAT_STRUCTURALLY_SYMMETRIC: 943 case MAT_HERMITIAN: 944 case MAT_SYMMETRY_ETERNAL: 945 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 946 case MAT_SPD_ETERNAL: 947 /* These options are handled directly by MatSetOption() */ 948 break; 949 default: 950 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 951 } 952 PetscFunctionReturn(PETSC_SUCCESS); 953 } 954 955 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 956 { 957 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 958 PetscInt i, j, n, shift; 959 PetscScalar *x, zero = 0.0; 960 961 PetscFunctionBegin; 962 PetscCall(VecGetLocalSize(v, &n)); 963 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 964 965 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 966 PetscInt *diag = a->diag; 967 PetscCall(VecGetArray(v, &x)); 968 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 969 PetscCall(VecRestoreArray(v, &x)); 970 PetscFunctionReturn(PETSC_SUCCESS); 971 } 972 973 PetscCall(VecSet(v, zero)); 974 PetscCall(VecGetArray(v, &x)); 975 for (i = 0; i < n; i++) { /* loop over rows */ 976 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 977 x[i] = 0; 978 for (j = 0; j < a->rlen[i]; j++) { 979 if (a->colidx[shift + a->sliceheight * j] == i) { 980 x[i] = a->val[shift + a->sliceheight * j]; 981 break; 982 } 983 } 984 } 985 PetscCall(VecRestoreArray(v, &x)); 986 PetscFunctionReturn(PETSC_SUCCESS); 987 } 988 989 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 990 { 991 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 992 const PetscScalar *l, *r; 993 PetscInt i, j, m, n, row; 994 995 PetscFunctionBegin; 996 if (ll) { 997 /* The local size is used so that VecMPI can be passed to this routine 998 by MatDiagonalScale_MPISELL */ 999 PetscCall(VecGetLocalSize(ll, &m)); 1000 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1001 PetscCall(VecGetArrayRead(ll, &l)); 1002 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1003 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1004 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1005 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1006 } 1007 } else { 1008 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1009 } 1010 } 1011 PetscCall(VecRestoreArrayRead(ll, &l)); 1012 PetscCall(PetscLogFlops(a->nz)); 1013 } 1014 if (rr) { 1015 PetscCall(VecGetLocalSize(rr, &n)); 1016 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1017 PetscCall(VecGetArrayRead(rr, &r)); 1018 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1019 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1020 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1021 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1022 } 1023 } else { 1024 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1025 } 1026 } 1027 PetscCall(VecRestoreArrayRead(rr, &r)); 1028 PetscCall(PetscLogFlops(a->nz)); 1029 } 1030 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1031 #if defined(PETSC_HAVE_CUDA) 1032 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1033 #endif 1034 PetscFunctionReturn(PETSC_SUCCESS); 1035 } 1036 1037 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1038 { 1039 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1040 PetscInt *cp, i, k, low, high, t, row, col, l; 1041 PetscInt shift; 1042 MatScalar *vp; 1043 1044 PetscFunctionBegin; 1045 for (k = 0; k < m; k++) { /* loop over requested rows */ 1046 row = im[k]; 1047 if (row < 0) continue; 1048 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1049 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1050 cp = a->colidx + shift; /* pointer to the row */ 1051 vp = a->val + shift; /* pointer to the row */ 1052 for (l = 0; l < n; l++) { /* loop over requested columns */ 1053 col = in[l]; 1054 if (col < 0) continue; 1055 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1056 high = a->rlen[row]; 1057 low = 0; /* assume unsorted */ 1058 while (high - low > 5) { 1059 t = (low + high) / 2; 1060 if (*(cp + a->sliceheight * t) > col) high = t; 1061 else low = t; 1062 } 1063 for (i = low; i < high; i++) { 1064 if (*(cp + a->sliceheight * i) > col) break; 1065 if (*(cp + a->sliceheight * i) == col) { 1066 *v++ = *(vp + a->sliceheight * i); 1067 goto finished; 1068 } 1069 } 1070 *v++ = 0.0; 1071 finished:; 1072 } 1073 } 1074 PetscFunctionReturn(PETSC_SUCCESS); 1075 } 1076 1077 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1078 { 1079 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1080 PetscInt i, j, m = A->rmap->n, shift; 1081 const char *name; 1082 PetscViewerFormat format; 1083 1084 PetscFunctionBegin; 1085 PetscCall(PetscViewerGetFormat(viewer, &format)); 1086 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1087 PetscInt nofinalvalue = 0; 1088 /* 1089 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1090 nofinalvalue = 1; 1091 } 1092 */ 1093 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1094 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1095 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1096 #if defined(PETSC_USE_COMPLEX) 1097 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1098 #else 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1100 #endif 1101 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1102 1103 for (i = 0; i < m; i++) { 1104 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1105 for (j = 0; j < a->rlen[i]; j++) { 1106 #if defined(PETSC_USE_COMPLEX) 1107 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1108 #else 1109 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1110 #endif 1111 } 1112 } 1113 /* 1114 if (nofinalvalue) { 1115 #if defined(PETSC_USE_COMPLEX) 1116 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1117 #else 1118 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1119 #endif 1120 } 1121 */ 1122 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1123 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1124 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1125 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1126 PetscFunctionReturn(PETSC_SUCCESS); 1127 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1128 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1129 for (i = 0; i < m; i++) { 1130 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1131 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1132 for (j = 0; j < a->rlen[i]; j++) { 1133 #if defined(PETSC_USE_COMPLEX) 1134 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1135 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1136 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1137 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1138 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1139 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1140 } 1141 #else 1142 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1143 #endif 1144 } 1145 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1146 } 1147 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1148 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1149 PetscInt cnt = 0, jcnt; 1150 PetscScalar value; 1151 #if defined(PETSC_USE_COMPLEX) 1152 PetscBool realonly = PETSC_TRUE; 1153 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1154 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1155 realonly = PETSC_FALSE; 1156 break; 1157 } 1158 } 1159 #endif 1160 1161 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1162 for (i = 0; i < m; i++) { 1163 jcnt = 0; 1164 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1165 for (j = 0; j < A->cmap->n; j++) { 1166 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1167 value = a->val[cnt++]; 1168 jcnt++; 1169 } else { 1170 value = 0.0; 1171 } 1172 #if defined(PETSC_USE_COMPLEX) 1173 if (realonly) { 1174 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1175 } else { 1176 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1177 } 1178 #else 1179 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1180 #endif 1181 } 1182 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1183 } 1184 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1185 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1186 PetscInt fshift = 1; 1187 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1188 #if defined(PETSC_USE_COMPLEX) 1189 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1190 #else 1191 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1192 #endif 1193 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1194 for (i = 0; i < m; i++) { 1195 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1196 for (j = 0; j < a->rlen[i]; j++) { 1197 #if defined(PETSC_USE_COMPLEX) 1198 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1199 #else 1200 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1201 #endif 1202 } 1203 } 1204 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1205 } else if (format == PETSC_VIEWER_NATIVE) { 1206 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1207 PetscInt row; 1208 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1209 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1210 #if defined(PETSC_USE_COMPLEX) 1211 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1212 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1213 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1214 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1215 } else { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1217 } 1218 #else 1219 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1220 #endif 1221 } 1222 } 1223 } else { 1224 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1225 if (A->factortype) { 1226 for (i = 0; i < m; i++) { 1227 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1228 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1229 /* L part */ 1230 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1231 #if defined(PETSC_USE_COMPLEX) 1232 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1233 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1234 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1235 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1236 } else { 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1238 } 1239 #else 1240 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1241 #endif 1242 } 1243 /* diagonal */ 1244 j = a->diag[i]; 1245 #if defined(PETSC_USE_COMPLEX) 1246 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1247 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1248 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1250 } else { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1252 } 1253 #else 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1255 #endif 1256 1257 /* U part */ 1258 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1259 #if defined(PETSC_USE_COMPLEX) 1260 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1261 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1262 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1263 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1264 } else { 1265 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1266 } 1267 #else 1268 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1269 #endif 1270 } 1271 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1272 } 1273 } else { 1274 for (i = 0; i < m; i++) { 1275 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1276 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1277 for (j = 0; j < a->rlen[i]; j++) { 1278 #if defined(PETSC_USE_COMPLEX) 1279 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1280 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1281 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1282 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1283 } else { 1284 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1285 } 1286 #else 1287 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1288 #endif 1289 } 1290 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1291 } 1292 } 1293 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1294 } 1295 PetscCall(PetscViewerFlush(viewer)); 1296 PetscFunctionReturn(PETSC_SUCCESS); 1297 } 1298 1299 #include <petscdraw.h> 1300 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1301 { 1302 Mat A = (Mat)Aa; 1303 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1304 PetscInt i, j, m = A->rmap->n, shift; 1305 int color; 1306 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1307 PetscViewer viewer; 1308 PetscViewerFormat format; 1309 1310 PetscFunctionBegin; 1311 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1312 PetscCall(PetscViewerGetFormat(viewer, &format)); 1313 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1314 1315 /* loop over matrix elements drawing boxes */ 1316 1317 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1318 PetscDrawCollectiveBegin(draw); 1319 /* Blue for negative, Cyan for zero and Red for positive */ 1320 color = PETSC_DRAW_BLUE; 1321 for (i = 0; i < m; i++) { 1322 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1323 y_l = m - i - 1.0; 1324 y_r = y_l + 1.0; 1325 for (j = 0; j < a->rlen[i]; j++) { 1326 x_l = a->colidx[shift + a->sliceheight * j]; 1327 x_r = x_l + 1.0; 1328 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1329 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1330 } 1331 } 1332 color = PETSC_DRAW_CYAN; 1333 for (i = 0; i < m; i++) { 1334 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1335 y_l = m - i - 1.0; 1336 y_r = y_l + 1.0; 1337 for (j = 0; j < a->rlen[i]; j++) { 1338 x_l = a->colidx[shift + a->sliceheight * j]; 1339 x_r = x_l + 1.0; 1340 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1341 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1342 } 1343 } 1344 color = PETSC_DRAW_RED; 1345 for (i = 0; i < m; i++) { 1346 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1347 y_l = m - i - 1.0; 1348 y_r = y_l + 1.0; 1349 for (j = 0; j < a->rlen[i]; j++) { 1350 x_l = a->colidx[shift + a->sliceheight * j]; 1351 x_r = x_l + 1.0; 1352 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1353 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1354 } 1355 } 1356 PetscDrawCollectiveEnd(draw); 1357 } else { 1358 /* use contour shading to indicate magnitude of values */ 1359 /* first determine max of all nonzero values */ 1360 PetscReal minv = 0.0, maxv = 0.0; 1361 PetscInt count = 0; 1362 PetscDraw popup; 1363 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1364 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1365 } 1366 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1367 PetscCall(PetscDrawGetPopup(draw, &popup)); 1368 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1369 1370 PetscDrawCollectiveBegin(draw); 1371 for (i = 0; i < m; i++) { 1372 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1373 y_l = m - i - 1.0; 1374 y_r = y_l + 1.0; 1375 for (j = 0; j < a->rlen[i]; j++) { 1376 x_l = a->colidx[shift + a->sliceheight * j]; 1377 x_r = x_l + 1.0; 1378 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1379 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1380 count++; 1381 } 1382 } 1383 PetscDrawCollectiveEnd(draw); 1384 } 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 #include <petscdraw.h> 1389 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1390 { 1391 PetscDraw draw; 1392 PetscReal xr, yr, xl, yl, h, w; 1393 PetscBool isnull; 1394 1395 PetscFunctionBegin; 1396 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1397 PetscCall(PetscDrawIsNull(draw, &isnull)); 1398 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1399 1400 xr = A->cmap->n; 1401 yr = A->rmap->n; 1402 h = yr / 10.0; 1403 w = xr / 10.0; 1404 xr += w; 1405 yr += h; 1406 xl = -w; 1407 yl = -h; 1408 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1409 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1410 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1411 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1412 PetscCall(PetscDrawSave(draw)); 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1417 { 1418 PetscBool iascii, isbinary, isdraw; 1419 1420 PetscFunctionBegin; 1421 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1422 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1423 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1424 if (iascii) { 1425 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1426 } else if (isbinary) { 1427 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1428 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431 1432 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1433 { 1434 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1435 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1436 MatScalar *vp; 1437 #if defined(PETSC_HAVE_CUDA) 1438 PetscInt totalchunks = 0; 1439 #endif 1440 1441 PetscFunctionBegin; 1442 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1443 /* To do: compress out the unused elements */ 1444 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1445 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1446 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1447 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1448 a->nonzerorowcnt = 0; 1449 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1450 for (i = 0; i < a->totalslices; ++i) { 1451 shift = a->sliidx[i]; /* starting index of the slice */ 1452 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1453 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1454 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1455 row = a->sliceheight * i + row_in_slice; 1456 nrow = a->rlen[row]; /* number of nonzeros in row */ 1457 /* 1458 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1459 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1460 */ 1461 lastcol = 0; 1462 if (nrow > 0) { /* nonempty row */ 1463 a->nonzerorowcnt++; 1464 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1465 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1466 for (j = 1; j < a->sliceheight; j++) { 1467 if (a->rlen[a->sliceheight * i + j]) { 1468 lastcol = cp[j]; 1469 break; 1470 } 1471 } 1472 } else { 1473 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1474 } 1475 1476 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1477 cp[a->sliceheight * k + row_in_slice] = lastcol; 1478 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1479 } 1480 } 1481 } 1482 1483 A->info.mallocs += a->reallocs; 1484 a->reallocs = 0; 1485 1486 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1487 #if defined(PETSC_HAVE_CUDA) 1488 if (!a->chunksize && a->totalslices) { 1489 a->chunksize = 64; 1490 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1491 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1492 } 1493 if (totalchunks != a->totalchunks) { 1494 PetscCall(PetscFree(a->chunk_slice_map)); 1495 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1496 a->totalchunks = totalchunks; 1497 } 1498 j = 0; 1499 for (i = 0; i < totalchunks; i++) { 1500 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1501 a->chunk_slice_map[i] = j; 1502 } 1503 #endif 1504 PetscFunctionReturn(PETSC_SUCCESS); 1505 } 1506 1507 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1508 { 1509 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1510 1511 PetscFunctionBegin; 1512 info->block_size = 1.0; 1513 info->nz_allocated = a->maxallocmat; 1514 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1515 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1516 info->assemblies = A->num_ass; 1517 info->mallocs = A->info.mallocs; 1518 info->memory = 0; /* REVIEW ME */ 1519 if (A->factortype) { 1520 info->fill_ratio_given = A->info.fill_ratio_given; 1521 info->fill_ratio_needed = A->info.fill_ratio_needed; 1522 info->factor_mallocs = A->info.factor_mallocs; 1523 } else { 1524 info->fill_ratio_given = 0; 1525 info->fill_ratio_needed = 0; 1526 info->factor_mallocs = 0; 1527 } 1528 PetscFunctionReturn(PETSC_SUCCESS); 1529 } 1530 1531 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1532 { 1533 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1534 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1535 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1536 MatScalar *vp, value; 1537 #if defined(PETSC_HAVE_CUDA) 1538 PetscBool inserted = PETSC_FALSE; 1539 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1540 #endif 1541 1542 PetscFunctionBegin; 1543 for (k = 0; k < m; k++) { /* loop over added rows */ 1544 row = im[k]; 1545 if (row < 0) continue; 1546 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1547 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1548 cp = a->colidx + shift; /* pointer to the row */ 1549 vp = a->val + shift; /* pointer to the row */ 1550 nrow = a->rlen[row]; 1551 low = 0; 1552 high = nrow; 1553 1554 for (l = 0; l < n; l++) { /* loop over added columns */ 1555 col = in[l]; 1556 if (col < 0) continue; 1557 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1558 if (a->roworiented) { 1559 value = v[l + k * n]; 1560 } else { 1561 value = v[k + l * m]; 1562 } 1563 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1564 1565 /* search in this row for the specified column, i indicates the column to be set */ 1566 if (col <= lastcol) low = 0; 1567 else high = nrow; 1568 lastcol = col; 1569 while (high - low > 5) { 1570 t = (low + high) / 2; 1571 if (*(cp + a->sliceheight * t) > col) high = t; 1572 else low = t; 1573 } 1574 for (i = low; i < high; i++) { 1575 if (*(cp + a->sliceheight * i) > col) break; 1576 if (*(cp + a->sliceheight * i) == col) { 1577 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1578 else *(vp + a->sliceheight * i) = value; 1579 #if defined(PETSC_HAVE_CUDA) 1580 inserted = PETSC_TRUE; 1581 #endif 1582 low = i + 1; 1583 goto noinsert; 1584 } 1585 } 1586 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1587 if (nonew == 1) goto noinsert; 1588 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1589 #if defined(PETSC_HAVE_CUDA) 1590 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1591 #else 1592 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1593 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1594 #endif 1595 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1596 for (ii = nrow - 1; ii >= i; ii--) { 1597 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1598 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1599 } 1600 a->rlen[row]++; 1601 *(cp + a->sliceheight * i) = col; 1602 *(vp + a->sliceheight * i) = value; 1603 a->nz++; 1604 A->nonzerostate++; 1605 #if defined(PETSC_HAVE_CUDA) 1606 inserted = PETSC_TRUE; 1607 #endif 1608 low = i + 1; 1609 high++; 1610 nrow++; 1611 noinsert:; 1612 } 1613 a->rlen[row] = nrow; 1614 } 1615 #if defined(PETSC_HAVE_CUDA) 1616 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1617 #endif 1618 PetscFunctionReturn(PETSC_SUCCESS); 1619 } 1620 1621 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1622 { 1623 PetscFunctionBegin; 1624 /* If the two matrices have the same copy implementation, use fast copy. */ 1625 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1626 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1627 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1628 1629 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1630 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1631 } else { 1632 PetscCall(MatCopy_Basic(A, B, str)); 1633 } 1634 PetscFunctionReturn(PETSC_SUCCESS); 1635 } 1636 1637 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1638 { 1639 PetscFunctionBegin; 1640 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1641 PetscFunctionReturn(PETSC_SUCCESS); 1642 } 1643 1644 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1645 { 1646 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1647 1648 PetscFunctionBegin; 1649 *array = a->val; 1650 PetscFunctionReturn(PETSC_SUCCESS); 1651 } 1652 1653 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1654 { 1655 PetscFunctionBegin; 1656 PetscFunctionReturn(PETSC_SUCCESS); 1657 } 1658 1659 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1660 { 1661 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1662 MatScalar *aval = a->val; 1663 PetscScalar oalpha = alpha; 1664 PetscBLASInt one = 1, size; 1665 1666 PetscFunctionBegin; 1667 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1668 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1669 PetscCall(PetscLogFlops(a->nz)); 1670 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1671 #if defined(PETSC_HAVE_CUDA) 1672 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1673 #endif 1674 PetscFunctionReturn(PETSC_SUCCESS); 1675 } 1676 1677 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1678 { 1679 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1680 1681 PetscFunctionBegin; 1682 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1683 PetscCall(MatShift_Basic(Y, a)); 1684 PetscFunctionReturn(PETSC_SUCCESS); 1685 } 1686 1687 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1688 { 1689 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1690 PetscScalar *x, sum, *t; 1691 const MatScalar *idiag = NULL, *mdiag; 1692 const PetscScalar *b, *xb; 1693 PetscInt n, m = A->rmap->n, i, j, shift; 1694 const PetscInt *diag; 1695 1696 PetscFunctionBegin; 1697 its = its * lits; 1698 1699 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1700 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1701 a->fshift = fshift; 1702 a->omega = omega; 1703 1704 diag = a->diag; 1705 t = a->ssor_work; 1706 idiag = a->idiag; 1707 mdiag = a->mdiag; 1708 1709 PetscCall(VecGetArray(xx, &x)); 1710 PetscCall(VecGetArrayRead(bb, &b)); 1711 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1712 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1713 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1714 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1715 1716 if (flag & SOR_ZERO_INITIAL_GUESS) { 1717 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1718 for (i = 0; i < m; i++) { 1719 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1720 sum = b[i]; 1721 n = (diag[i] - shift) / a->sliceheight; 1722 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1723 t[i] = sum; 1724 x[i] = sum * idiag[i]; 1725 } 1726 xb = t; 1727 PetscCall(PetscLogFlops(a->nz)); 1728 } else xb = b; 1729 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1730 for (i = m - 1; i >= 0; i--) { 1731 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1732 sum = xb[i]; 1733 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1734 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1735 if (xb == b) { 1736 x[i] = sum * idiag[i]; 1737 } else { 1738 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1739 } 1740 } 1741 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1742 } 1743 its--; 1744 } 1745 while (its--) { 1746 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1747 for (i = 0; i < m; i++) { 1748 /* lower */ 1749 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1750 sum = b[i]; 1751 n = (diag[i] - shift) / a->sliceheight; 1752 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1753 t[i] = sum; /* save application of the lower-triangular part */ 1754 /* upper */ 1755 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1756 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1757 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1758 } 1759 xb = t; 1760 PetscCall(PetscLogFlops(2.0 * a->nz)); 1761 } else xb = b; 1762 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1763 for (i = m - 1; i >= 0; i--) { 1764 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1765 sum = xb[i]; 1766 if (xb == b) { 1767 /* whole matrix (no checkpointing available) */ 1768 n = a->rlen[i]; 1769 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1770 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1771 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1772 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1773 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1774 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1775 } 1776 } 1777 if (xb == b) { 1778 PetscCall(PetscLogFlops(2.0 * a->nz)); 1779 } else { 1780 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1781 } 1782 } 1783 } 1784 PetscCall(VecRestoreArray(xx, &x)); 1785 PetscCall(VecRestoreArrayRead(bb, &b)); 1786 PetscFunctionReturn(PETSC_SUCCESS); 1787 } 1788 1789 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1790 MatGetRow_SeqSELL, 1791 MatRestoreRow_SeqSELL, 1792 MatMult_SeqSELL, 1793 /* 4*/ MatMultAdd_SeqSELL, 1794 MatMultTranspose_SeqSELL, 1795 MatMultTransposeAdd_SeqSELL, 1796 NULL, 1797 NULL, 1798 NULL, 1799 /* 10*/ NULL, 1800 NULL, 1801 NULL, 1802 MatSOR_SeqSELL, 1803 NULL, 1804 /* 15*/ MatGetInfo_SeqSELL, 1805 MatEqual_SeqSELL, 1806 MatGetDiagonal_SeqSELL, 1807 MatDiagonalScale_SeqSELL, 1808 NULL, 1809 /* 20*/ NULL, 1810 MatAssemblyEnd_SeqSELL, 1811 MatSetOption_SeqSELL, 1812 MatZeroEntries_SeqSELL, 1813 /* 24*/ NULL, 1814 NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 /* 29*/ MatSetUp_SeqSELL, 1819 NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 /* 34*/ MatDuplicate_SeqSELL, 1824 NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 /* 39*/ NULL, 1829 NULL, 1830 NULL, 1831 MatGetValues_SeqSELL, 1832 MatCopy_SeqSELL, 1833 /* 44*/ NULL, 1834 MatScale_SeqSELL, 1835 MatShift_SeqSELL, 1836 NULL, 1837 NULL, 1838 /* 49*/ NULL, 1839 NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1844 NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 /* 59*/ NULL, 1849 MatDestroy_SeqSELL, 1850 MatView_SeqSELL, 1851 NULL, 1852 NULL, 1853 /* 64*/ NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 /* 69*/ NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 /* 74*/ NULL, 1864 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1865 NULL, 1866 NULL, 1867 NULL, 1868 /* 79*/ NULL, 1869 NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 /* 84*/ NULL, 1874 NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 /* 89*/ NULL, 1879 NULL, 1880 NULL, 1881 NULL, 1882 NULL, 1883 /* 94*/ NULL, 1884 NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 /* 99*/ NULL, 1889 NULL, 1890 NULL, 1891 MatConjugate_SeqSELL, 1892 NULL, 1893 /*104*/ NULL, 1894 NULL, 1895 NULL, 1896 NULL, 1897 NULL, 1898 /*109*/ NULL, 1899 NULL, 1900 NULL, 1901 NULL, 1902 MatMissingDiagonal_SeqSELL, 1903 /*114*/ NULL, 1904 NULL, 1905 NULL, 1906 NULL, 1907 NULL, 1908 /*119*/ NULL, 1909 NULL, 1910 NULL, 1911 NULL, 1912 NULL, 1913 /*124*/ NULL, 1914 NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 /*129*/ NULL, 1919 NULL, 1920 NULL, 1921 NULL, 1922 NULL, 1923 /*134*/ NULL, 1924 NULL, 1925 NULL, 1926 NULL, 1927 NULL, 1928 /*139*/ NULL, 1929 NULL, 1930 NULL, 1931 MatFDColoringSetUp_SeqXAIJ, 1932 NULL, 1933 /*144*/ NULL, 1934 NULL, 1935 NULL, 1936 NULL, 1937 NULL, 1938 NULL, 1939 /*150*/ NULL, 1940 NULL, 1941 NULL}; 1942 1943 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1944 { 1945 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1946 1947 PetscFunctionBegin; 1948 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1949 1950 /* allocate space for values if not already there */ 1951 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1952 1953 /* copy values over */ 1954 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1955 PetscFunctionReturn(PETSC_SUCCESS); 1956 } 1957 1958 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1959 { 1960 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1961 1962 PetscFunctionBegin; 1963 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1964 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1965 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1966 PetscFunctionReturn(PETSC_SUCCESS); 1967 } 1968 1969 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1970 { 1971 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1972 1973 PetscFunctionBegin; 1974 if (a->totalslices && a->sliidx[a->totalslices]) { 1975 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1976 } else { 1977 *ratio = 0.0; 1978 } 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1983 { 1984 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1985 PetscInt i, current_slicewidth; 1986 1987 PetscFunctionBegin; 1988 *slicewidth = 0; 1989 for (i = 0; i < a->totalslices; i++) { 1990 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1991 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1992 } 1993 PetscFunctionReturn(PETSC_SUCCESS); 1994 } 1995 1996 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1997 { 1998 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1999 2000 PetscFunctionBegin; 2001 *slicewidth = 0; 2002 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2003 PetscFunctionReturn(PETSC_SUCCESS); 2004 } 2005 2006 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2007 { 2008 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2009 PetscReal mean; 2010 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2011 2012 PetscFunctionBegin; 2013 *variance = 0; 2014 if (totalslices) { 2015 mean = (PetscReal)sliidx[totalslices] / totalslices; 2016 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2017 } 2018 PetscFunctionReturn(PETSC_SUCCESS); 2019 } 2020 2021 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2022 { 2023 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2024 2025 PetscFunctionBegin; 2026 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2027 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2028 a->sliceheight = sliceheight; 2029 #if defined(PETSC_HAVE_CUDA) 2030 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2031 #endif 2032 PetscFunctionReturn(PETSC_SUCCESS); 2033 } 2034 2035 /*@C 2036 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2037 2038 Not Collective 2039 2040 Input Parameter: 2041 . A - a MATSEQSELL matrix 2042 2043 Output Parameter: 2044 . ratio - ratio of number of padded zeros to number of allocated elements 2045 2046 Level: intermediate 2047 2048 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2049 @*/ 2050 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2051 { 2052 PetscFunctionBegin; 2053 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2054 PetscFunctionReturn(PETSC_SUCCESS); 2055 } 2056 2057 /*@C 2058 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2059 2060 Not Collective 2061 2062 Input Parameter: 2063 . A - a MATSEQSELL matrix 2064 2065 Output Parameter: 2066 . slicewidth - maximum slice width 2067 2068 Level: intermediate 2069 2070 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2071 @*/ 2072 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2073 { 2074 PetscFunctionBegin; 2075 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2076 PetscFunctionReturn(PETSC_SUCCESS); 2077 } 2078 2079 /*@C 2080 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2081 2082 Not Collective 2083 2084 Input Parameter: 2085 . A - a MATSEQSELL matrix 2086 2087 Output Parameter: 2088 . slicewidth - average slice width 2089 2090 Level: intermediate 2091 2092 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2093 @*/ 2094 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2095 { 2096 PetscFunctionBegin; 2097 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2098 PetscFunctionReturn(PETSC_SUCCESS); 2099 } 2100 2101 /*@C 2102 MatSeqSELLSetSliceHeight - sets the slice height. 2103 2104 Not Collective 2105 2106 Input Parameters: 2107 + A - a MATSEQSELL matrix 2108 - sliceheight - slice height 2109 2110 Notes: 2111 You cannot change the slice height once it have been set. 2112 2113 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2114 2115 Level: intermediate 2116 2117 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2118 @*/ 2119 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2120 { 2121 PetscFunctionBegin; 2122 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2123 PetscFunctionReturn(PETSC_SUCCESS); 2124 } 2125 2126 /*@C 2127 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2128 2129 Not Collective 2130 2131 Input Parameter: 2132 . A - a MATSEQSELL matrix 2133 2134 Output Parameter: 2135 . variance - variance of the slice size 2136 2137 Level: intermediate 2138 2139 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2140 @*/ 2141 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2142 { 2143 PetscFunctionBegin; 2144 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2145 PetscFunctionReturn(PETSC_SUCCESS); 2146 } 2147 2148 #if defined(PETSC_HAVE_CUDA) 2149 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2150 #endif 2151 2152 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2153 { 2154 Mat_SeqSELL *b; 2155 PetscMPIInt size; 2156 2157 PetscFunctionBegin; 2158 PetscCall(PetscCitationsRegister(citation, &cited)); 2159 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2160 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2161 2162 PetscCall(PetscNew(&b)); 2163 2164 B->data = (void *)b; 2165 B->ops[0] = MatOps_Values; 2166 2167 b->row = NULL; 2168 b->col = NULL; 2169 b->icol = NULL; 2170 b->reallocs = 0; 2171 b->ignorezeroentries = PETSC_FALSE; 2172 b->roworiented = PETSC_TRUE; 2173 b->nonew = 0; 2174 b->diag = NULL; 2175 b->solve_work = NULL; 2176 B->spptr = NULL; 2177 b->saved_values = NULL; 2178 b->idiag = NULL; 2179 b->mdiag = NULL; 2180 b->ssor_work = NULL; 2181 b->omega = 1.0; 2182 b->fshift = 0.0; 2183 b->idiagvalid = PETSC_FALSE; 2184 b->keepnonzeropattern = PETSC_FALSE; 2185 b->sliceheight = 0; 2186 2187 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2188 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2191 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2193 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2194 #if defined(PETSC_HAVE_CUDA) 2195 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2196 #endif 2197 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2198 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2199 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2200 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2202 2203 PetscObjectOptionsBegin((PetscObject)B); 2204 { 2205 PetscInt newsh = -1; 2206 PetscBool flg; 2207 #if defined(PETSC_HAVE_CUDA) 2208 PetscInt chunksize = 0; 2209 #endif 2210 2211 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2212 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2213 #if defined(PETSC_HAVE_CUDA) 2214 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2215 if (flg) { 2216 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2217 b->chunksize = chunksize; 2218 } 2219 #endif 2220 } 2221 PetscOptionsEnd(); 2222 PetscFunctionReturn(PETSC_SUCCESS); 2223 } 2224 2225 /* 2226 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2227 */ 2228 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2229 { 2230 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2231 PetscInt i, m = A->rmap->n; 2232 PetscInt totalslices = a->totalslices; 2233 2234 PetscFunctionBegin; 2235 C->factortype = A->factortype; 2236 c->row = NULL; 2237 c->col = NULL; 2238 c->icol = NULL; 2239 c->reallocs = 0; 2240 C->assembled = PETSC_TRUE; 2241 2242 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2243 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2244 2245 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2246 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2247 2248 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2249 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2250 2251 /* allocate the matrix space */ 2252 if (mallocmatspace) { 2253 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2254 2255 c->singlemalloc = PETSC_TRUE; 2256 2257 if (m > 0) { 2258 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2259 if (cpvalues == MAT_COPY_VALUES) { 2260 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2261 } else { 2262 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2263 } 2264 } 2265 } 2266 2267 c->ignorezeroentries = a->ignorezeroentries; 2268 c->roworiented = a->roworiented; 2269 c->nonew = a->nonew; 2270 if (a->diag) { 2271 PetscCall(PetscMalloc1(m, &c->diag)); 2272 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2273 } else c->diag = NULL; 2274 2275 c->solve_work = NULL; 2276 c->saved_values = NULL; 2277 c->idiag = NULL; 2278 c->ssor_work = NULL; 2279 c->keepnonzeropattern = a->keepnonzeropattern; 2280 c->free_val = PETSC_TRUE; 2281 c->free_colidx = PETSC_TRUE; 2282 2283 c->maxallocmat = a->maxallocmat; 2284 c->maxallocrow = a->maxallocrow; 2285 c->rlenmax = a->rlenmax; 2286 c->nz = a->nz; 2287 C->preallocated = PETSC_TRUE; 2288 2289 c->nonzerorowcnt = a->nonzerorowcnt; 2290 C->nonzerostate = A->nonzerostate; 2291 2292 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2293 PetscFunctionReturn(PETSC_SUCCESS); 2294 } 2295 2296 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2297 { 2298 PetscFunctionBegin; 2299 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2300 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2301 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2302 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2303 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2304 PetscFunctionReturn(PETSC_SUCCESS); 2305 } 2306 2307 /*MC 2308 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2309 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2310 2311 Options Database Key: 2312 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2313 2314 Level: beginner 2315 2316 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2317 M*/ 2318 2319 /*MC 2320 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2321 2322 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2323 and `MATMPISELL` otherwise. As a result, for single process communicators, 2324 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2325 for communicators controlling multiple processes. It is recommended that you call both of 2326 the above preallocation routines for simplicity. 2327 2328 Options Database Key: 2329 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2330 2331 Level: beginner 2332 2333 Notes: 2334 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2335 2336 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2337 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2338 2339 Developer Notes: 2340 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2341 2342 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2343 .vb 2344 (2 0 3 4) 2345 Consider the matrix A = (5 0 6 0) 2346 (0 0 7 8) 2347 (0 0 9 9) 2348 2349 symbolically the Ellpack format can be written as 2350 2351 (2 3 4 |) (0 2 3 |) 2352 v = (5 6 0 |) colidx = (0 2 2 |) 2353 -------- --------- 2354 (7 8 |) (2 3 |) 2355 (9 9 |) (2 3 |) 2356 2357 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2358 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2359 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2360 2361 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2362 2363 .ve 2364 2365 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2366 2367 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2368 M*/ 2369 2370 /*@C 2371 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2372 2373 Collective 2374 2375 Input Parameters: 2376 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2377 . m - number of rows 2378 . n - number of columns 2379 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2380 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2381 2382 Output Parameter: 2383 . A - the matrix 2384 2385 Level: intermediate 2386 2387 Notes: 2388 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2389 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2390 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2391 2392 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2393 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2394 allocation. 2395 2396 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2397 @*/ 2398 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2399 { 2400 PetscFunctionBegin; 2401 PetscCall(MatCreate(comm, A)); 2402 PetscCall(MatSetSizes(*A, m, n, m, n)); 2403 PetscCall(MatSetType(*A, MATSEQSELL)); 2404 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2405 PetscFunctionReturn(PETSC_SUCCESS); 2406 } 2407 2408 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2409 { 2410 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2411 PetscInt totalslices = a->totalslices; 2412 2413 PetscFunctionBegin; 2414 /* If the matrix dimensions are not equal,or no of nonzeros */ 2415 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2416 *flg = PETSC_FALSE; 2417 PetscFunctionReturn(PETSC_SUCCESS); 2418 } 2419 /* if the a->colidx are the same */ 2420 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2421 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2422 /* if a->val are the same */ 2423 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2424 PetscFunctionReturn(PETSC_SUCCESS); 2425 } 2426 2427 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2428 { 2429 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2430 2431 PetscFunctionBegin; 2432 a->idiagvalid = PETSC_FALSE; 2433 PetscFunctionReturn(PETSC_SUCCESS); 2434 } 2435 2436 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2437 { 2438 #if defined(PETSC_USE_COMPLEX) 2439 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2440 PetscInt i; 2441 PetscScalar *val = a->val; 2442 2443 PetscFunctionBegin; 2444 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2445 #if defined(PETSC_HAVE_CUDA) 2446 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2447 #endif 2448 #else 2449 PetscFunctionBegin; 2450 #endif 2451 PetscFunctionReturn(PETSC_SUCCESS); 2452 } 2453