1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Note: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 #if defined(PETSC_HAVE_CUDA) 93 PetscInt rlenmax = 0; 94 #endif 95 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 96 97 PetscFunctionBegin; 98 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 99 if (maxallocrow == MAT_SKIP_ALLOCATION) { 100 skipallocation = PETSC_TRUE; 101 maxallocrow = 0; 102 } 103 104 PetscCall(PetscLayoutSetUp(B->rmap)); 105 PetscCall(PetscLayoutSetUp(B->cmap)); 106 107 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 108 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 109 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 110 if (rlen) { 111 for (i = 0; i < B->rmap->n; i++) { 112 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 113 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 114 } 115 } 116 117 B->preallocated = PETSC_TRUE; 118 119 b = (Mat_SeqSELL *)B->data; 120 121 if (!b->sliceheight) { /* not set yet */ 122 #if defined(PETSC_HAVE_CUDA) 123 b->sliceheight = 16; 124 #else 125 b->sliceheight = 8; 126 #endif 127 } 128 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 129 b->totalslices = totalslices; 130 if (!skipallocation) { 131 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 132 133 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 134 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 135 } 136 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 137 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 138 else if (maxallocrow < 0) maxallocrow = 1; 139 #if defined(PETSC_HAVE_CUDA) 140 rlenmax = maxallocrow; 141 /* Pad the slice to DEVICE_MEM_ALIGN */ 142 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 143 #endif 144 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 145 } else { 146 #if defined(PETSC_HAVE_CUDA) 147 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 148 #endif 149 maxallocrow = 0; 150 b->sliidx[0] = 0; 151 for (i = 1; i < totalslices; i++) { 152 b->sliidx[i] = 0; 153 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 154 #if defined(PETSC_HAVE_CUDA) 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 /* Pad the slice to DEVICE_MEM_ALIGN */ 157 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUDA) 166 rlenmax = PetscMax(b->sliidx[i], rlenmax); 167 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 168 #endif 169 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 170 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 171 } 172 173 /* allocate space for val, colidx, rlen */ 174 /* FIXME: should B's old memory be unlogged? */ 175 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 176 /* FIXME: assuming an element of the bit array takes 8 bits */ 177 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 178 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 179 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 180 181 b->singlemalloc = PETSC_TRUE; 182 b->free_val = PETSC_TRUE; 183 b->free_colidx = PETSC_TRUE; 184 } else { 185 b->free_val = PETSC_FALSE; 186 b->free_colidx = PETSC_FALSE; 187 } 188 189 b->nz = 0; 190 b->maxallocrow = maxallocrow; 191 #if defined(PETSC_HAVE_CUDA) 192 b->rlenmax = rlenmax; 193 #else 194 b->rlenmax = maxallocrow; 195 #endif 196 b->maxallocmat = b->sliidx[totalslices]; 197 B->info.nz_unneeded = (double)b->maxallocmat; 198 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 203 { 204 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 205 PetscInt shift; 206 207 PetscFunctionBegin; 208 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 209 if (nz) *nz = a->rlen[row]; 210 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 211 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 212 if (idx) { 213 PetscInt j; 214 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 215 *idx = a->getrowcols; 216 } 217 if (v) { 218 PetscInt j; 219 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 220 *v = a->getrowvals; 221 } 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 226 { 227 PetscFunctionBegin; 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 232 { 233 Mat B; 234 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 235 PetscInt i; 236 237 PetscFunctionBegin; 238 if (reuse == MAT_REUSE_MATRIX) { 239 B = *newmat; 240 PetscCall(MatZeroEntries(B)); 241 } else { 242 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 243 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 244 PetscCall(MatSetType(B, MATSEQAIJ)); 245 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 246 } 247 248 for (i = 0; i < A->rmap->n; i++) { 249 PetscInt nz = 0, *cols = NULL; 250 PetscScalar *vals = NULL; 251 252 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 253 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 254 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 } 256 257 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 258 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 259 B->rmap->bs = A->rmap->bs; 260 261 if (reuse == MAT_INPLACE_MATRIX) { 262 PetscCall(MatHeaderReplace(A, &B)); 263 } else { 264 *newmat = B; 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 #include <../src/mat/impls/aij/seq/aij.h> 270 271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 272 { 273 Mat B; 274 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 275 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 276 const PetscInt *cols; 277 const PetscScalar *vals; 278 279 PetscFunctionBegin; 280 281 if (reuse == MAT_REUSE_MATRIX) { 282 B = *newmat; 283 } else { 284 if (PetscDefined(USE_DEBUG) || !a->ilen) { 285 PetscCall(PetscMalloc1(m, &rowlengths)); 286 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 287 } 288 if (PetscDefined(USE_DEBUG) && a->ilen) { 289 PetscBool eq; 290 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 291 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 292 PetscCall(PetscFree(rowlengths)); 293 rowlengths = a->ilen; 294 } else if (a->ilen) rowlengths = a->ilen; 295 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 296 PetscCall(MatSetSizes(B, m, n, m, n)); 297 PetscCall(MatSetType(B, MATSEQSELL)); 298 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 299 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 300 } 301 302 for (row = 0; row < m; row++) { 303 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 304 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 305 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 306 } 307 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 308 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 309 B->rmap->bs = A->rmap->bs; 310 311 if (reuse == MAT_INPLACE_MATRIX) { 312 PetscCall(MatHeaderReplace(A, &B)); 313 } else { 314 *newmat = B; 315 } 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 320 { 321 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 322 PetscScalar *y; 323 const PetscScalar *x; 324 const MatScalar *aval = a->val; 325 PetscInt totalslices = a->totalslices; 326 const PetscInt *acolidx = a->colidx; 327 PetscInt i, j; 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 __m512d vec_x, vec_y, vec_vals; 330 __m256i vec_idx; 331 __mmask8 mask; 332 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 333 __m256i vec_idx2, vec_idx3, vec_idx4; 334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 335 __m128i vec_idx; 336 __m256d vec_x, vec_y, vec_y2, vec_vals; 337 MatScalar yval; 338 PetscInt r, rows_left, row, nnz_in_row; 339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 340 __m128d vec_x_tmp; 341 __m256d vec_x, vec_y, vec_y2, vec_vals; 342 MatScalar yval; 343 PetscInt r, rows_left, row, nnz_in_row; 344 #else 345 PetscInt k, sliceheight = a->sliceheight; 346 PetscScalar *sum; 347 #endif 348 349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 350 #pragma disjoint(*x, *y, *aval) 351 #endif 352 353 PetscFunctionBegin; 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(yy, &y)); 356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 357 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 358 for (i = 0; i < totalslices; i++) { /* loop over slices */ 359 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 362 vec_y = _mm512_setzero_pd(); 363 vec_y2 = _mm512_setzero_pd(); 364 vec_y3 = _mm512_setzero_pd(); 365 vec_y4 = _mm512_setzero_pd(); 366 367 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 368 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 369 case 3: 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 j += 3; 380 break; 381 case 2: 382 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 383 acolidx += 8; 384 aval += 8; 385 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 386 acolidx += 8; 387 aval += 8; 388 j += 2; 389 break; 390 case 1: 391 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 392 acolidx += 8; 393 aval += 8; 394 j += 1; 395 break; 396 } 397 #pragma novector 398 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 399 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 400 acolidx += 8; 401 aval += 8; 402 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 403 acolidx += 8; 404 aval += 8; 405 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 406 acolidx += 8; 407 aval += 8; 408 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 409 acolidx += 8; 410 aval += 8; 411 } 412 413 vec_y = _mm512_add_pd(vec_y, vec_y2); 414 vec_y = _mm512_add_pd(vec_y, vec_y3); 415 vec_y = _mm512_add_pd(vec_y, vec_y4); 416 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 417 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 418 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 419 } else { 420 _mm512_storeu_pd(&y[8 * i], vec_y); 421 } 422 } 423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 424 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 425 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 426 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 429 /* last slice may have padding rows. Don't use vectorization. */ 430 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 431 rows_left = A->rmap->n - 8 * i; 432 for (r = 0; r < rows_left; ++r) { 433 yval = (MatScalar)0; 434 row = 8 * i + r; 435 nnz_in_row = a->rlen[row]; 436 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 437 y[row] = yval; 438 } 439 break; 440 } 441 442 vec_y = _mm256_setzero_pd(); 443 vec_y2 = _mm256_setzero_pd(); 444 445 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 446 #pragma novector 447 #pragma unroll(2) 448 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 449 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 450 aval += 4; 451 acolidx += 4; 452 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 453 aval += 4; 454 acolidx += 4; 455 } 456 457 _mm256_storeu_pd(y + i * 8, vec_y); 458 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 459 } 460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 461 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 462 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 463 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 466 vec_y = _mm256_setzero_pd(); 467 vec_y2 = _mm256_setzero_pd(); 468 469 /* last slice may have padding rows. Don't use vectorization. */ 470 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 471 rows_left = A->rmap->n - 8 * i; 472 for (r = 0; r < rows_left; ++r) { 473 yval = (MatScalar)0; 474 row = 8 * i + r; 475 nnz_in_row = a->rlen[row]; 476 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 477 y[row] = yval; 478 } 479 break; 480 } 481 482 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 483 #pragma novector 484 #pragma unroll(2) 485 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 486 vec_vals = _mm256_loadu_pd(aval); 487 vec_x_tmp = _mm_setzero_pd(); 488 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 489 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 490 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 491 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 492 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 493 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 494 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 495 aval += 4; 496 497 vec_vals = _mm256_loadu_pd(aval); 498 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 499 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 500 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 501 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 502 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 503 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 504 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 505 aval += 4; 506 } 507 508 _mm256_storeu_pd(y + i * 8, vec_y); 509 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 510 } 511 #else 512 PetscCall(PetscMalloc1(sliceheight, &sum)); 513 for (i = 0; i < totalslices; i++) { /* loop over slices */ 514 for (j = 0; j < sliceheight; j++) { 515 sum[j] = 0.0; 516 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 517 } 518 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 519 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 520 } else { 521 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 522 } 523 } 524 PetscCall(PetscFree(sum)); 525 #endif 526 527 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 528 PetscCall(VecRestoreArrayRead(xx, &x)); 529 PetscCall(VecRestoreArray(yy, &y)); 530 PetscFunctionReturn(PETSC_SUCCESS); 531 } 532 533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 535 { 536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 537 PetscScalar *y, *z; 538 const PetscScalar *x; 539 const MatScalar *aval = a->val; 540 PetscInt totalslices = a->totalslices; 541 const PetscInt *acolidx = a->colidx; 542 PetscInt i, j; 543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 544 __m512d vec_x, vec_y, vec_vals; 545 __m256i vec_idx; 546 __mmask8 mask; 547 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 548 __m256i vec_idx2, vec_idx3, vec_idx4; 549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 550 __m128d vec_x_tmp; 551 __m256d vec_x, vec_y, vec_y2, vec_vals; 552 MatScalar yval; 553 PetscInt r, row, nnz_in_row; 554 #else 555 PetscInt k, sliceheight = a->sliceheight; 556 PetscScalar *sum; 557 #endif 558 559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 560 #pragma disjoint(*x, *y, *aval) 561 #endif 562 563 PetscFunctionBegin; 564 if (!a->nz) { 565 PetscCall(VecCopy(yy, zz)); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 571 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 572 for (i = 0; i < totalslices; i++) { /* loop over slices */ 573 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 576 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 577 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 578 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 579 } else { 580 vec_y = _mm512_loadu_pd(&y[8 * i]); 581 } 582 vec_y2 = _mm512_setzero_pd(); 583 vec_y3 = _mm512_setzero_pd(); 584 vec_y4 = _mm512_setzero_pd(); 585 586 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 587 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 588 case 3: 589 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 596 acolidx += 8; 597 aval += 8; 598 j += 3; 599 break; 600 case 2: 601 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 602 acolidx += 8; 603 aval += 8; 604 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 605 acolidx += 8; 606 aval += 8; 607 j += 2; 608 break; 609 case 1: 610 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 611 acolidx += 8; 612 aval += 8; 613 j += 1; 614 break; 615 } 616 #pragma novector 617 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 618 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 619 acolidx += 8; 620 aval += 8; 621 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 622 acolidx += 8; 623 aval += 8; 624 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 625 acolidx += 8; 626 aval += 8; 627 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 628 acolidx += 8; 629 aval += 8; 630 } 631 632 vec_y = _mm512_add_pd(vec_y, vec_y2); 633 vec_y = _mm512_add_pd(vec_y, vec_y3); 634 vec_y = _mm512_add_pd(vec_y, vec_y4); 635 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 636 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 637 } else { 638 _mm512_storeu_pd(&z[8 * i], vec_y); 639 } 640 } 641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 642 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 643 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 644 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 647 /* last slice may have padding rows. Don't use vectorization. */ 648 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 649 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 650 row = 8 * i + r; 651 yval = (MatScalar)0.0; 652 nnz_in_row = a->rlen[row]; 653 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 654 z[row] = y[row] + yval; 655 } 656 break; 657 } 658 659 vec_y = _mm256_loadu_pd(y + 8 * i); 660 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 661 662 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 663 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 664 vec_vals = _mm256_loadu_pd(aval); 665 vec_x_tmp = _mm_setzero_pd(); 666 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 667 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 668 vec_x = _mm256_setzero_pd(); 669 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 670 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 671 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 672 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 673 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 674 aval += 4; 675 676 vec_vals = _mm256_loadu_pd(aval); 677 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 678 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 679 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 680 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 681 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 682 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 683 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 684 aval += 4; 685 } 686 687 _mm256_storeu_pd(z + i * 8, vec_y); 688 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 689 } 690 #else 691 PetscCall(PetscMalloc1(sliceheight, &sum)); 692 for (i = 0; i < totalslices; i++) { /* loop over slices */ 693 for (j = 0; j < sliceheight; j++) { 694 sum[j] = 0.0; 695 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 696 } 697 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 698 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 699 } else { 700 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 701 } 702 } 703 PetscCall(PetscFree(sum)); 704 #endif 705 706 PetscCall(PetscLogFlops(2.0 * a->nz)); 707 PetscCall(VecRestoreArrayRead(xx, &x)); 708 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 709 PetscFunctionReturn(PETSC_SUCCESS); 710 } 711 712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 713 { 714 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 715 PetscScalar *y; 716 const PetscScalar *x; 717 const MatScalar *aval = a->val; 718 const PetscInt *acolidx = a->colidx; 719 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 720 721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 722 #pragma disjoint(*x, *y, *aval) 723 #endif 724 725 PetscFunctionBegin; 726 if (A->symmetric == PETSC_BOOL3_TRUE) { 727 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 728 PetscFunctionReturn(PETSC_SUCCESS); 729 } 730 if (zz != yy) PetscCall(VecCopy(zz, yy)); 731 732 if (a->nz) { 733 PetscCall(VecGetArrayRead(xx, &x)); 734 PetscCall(VecGetArray(yy, &y)); 735 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 736 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 737 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 738 row = sliceheight * i + r; 739 nnz_in_row = a->rlen[row]; 740 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 741 } 742 break; 743 } 744 for (r = 0; r < sliceheight; ++r) 745 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 746 } 747 PetscCall(PetscLogFlops(2.0 * a->nz)); 748 PetscCall(VecRestoreArrayRead(xx, &x)); 749 PetscCall(VecRestoreArray(yy, &y)); 750 } 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 755 { 756 PetscFunctionBegin; 757 if (A->symmetric == PETSC_BOOL3_TRUE) { 758 PetscCall(MatMult_SeqSELL(A, xx, yy)); 759 } else { 760 PetscCall(VecSet(yy, 0.0)); 761 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 762 } 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 766 /* 767 Checks for missing diagonals 768 */ 769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 770 { 771 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 772 PetscInt *diag, i; 773 774 PetscFunctionBegin; 775 *missing = PETSC_FALSE; 776 if (A->rmap->n > 0 && !(a->colidx)) { 777 *missing = PETSC_TRUE; 778 if (d) *d = 0; 779 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 780 } else { 781 diag = a->diag; 782 for (i = 0; i < A->rmap->n; i++) { 783 if (diag[i] == -1) { 784 *missing = PETSC_TRUE; 785 if (d) *d = i; 786 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 795 { 796 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 797 PetscInt i, j, m = A->rmap->n, shift; 798 799 PetscFunctionBegin; 800 if (!a->diag) { 801 PetscCall(PetscMalloc1(m, &a->diag)); 802 a->free_diag = PETSC_TRUE; 803 } 804 for (i = 0; i < m; i++) { /* loop over rows */ 805 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 806 a->diag[i] = -1; 807 for (j = 0; j < a->rlen[i]; j++) { 808 if (a->colidx[shift + a->sliceheight * j] == i) { 809 a->diag[i] = shift + a->sliceheight * j; 810 break; 811 } 812 } 813 } 814 PetscFunctionReturn(PETSC_SUCCESS); 815 } 816 817 /* 818 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 819 */ 820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 821 { 822 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 823 PetscInt i, *diag, m = A->rmap->n; 824 MatScalar *val = a->val; 825 PetscScalar *idiag, *mdiag; 826 827 PetscFunctionBegin; 828 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 829 PetscCall(MatMarkDiagonal_SeqSELL(A)); 830 diag = a->diag; 831 if (!a->idiag) { 832 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 833 val = a->val; 834 } 835 mdiag = a->mdiag; 836 idiag = a->idiag; 837 838 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 839 for (i = 0; i < m; i++) { 840 mdiag[i] = val[diag[i]]; 841 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 842 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 843 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 844 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 845 A->factorerror_zeropivot_value = 0.0; 846 A->factorerror_zeropivot_row = i; 847 } 848 idiag[i] = 1.0 / val[diag[i]]; 849 } 850 PetscCall(PetscLogFlops(m)); 851 } else { 852 for (i = 0; i < m; i++) { 853 mdiag[i] = val[diag[i]]; 854 idiag[i] = omega / (fshift + val[diag[i]]); 855 } 856 PetscCall(PetscLogFlops(2.0 * m)); 857 } 858 a->idiagvalid = PETSC_TRUE; 859 PetscFunctionReturn(PETSC_SUCCESS); 860 } 861 862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 863 { 864 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 865 866 PetscFunctionBegin; 867 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 868 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 PetscErrorCode MatDestroy_SeqSELL(Mat A) 873 { 874 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 875 876 PetscFunctionBegin; 877 #if defined(PETSC_USE_LOG) 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 #endif 880 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 881 PetscCall(ISDestroy(&a->row)); 882 PetscCall(ISDestroy(&a->col)); 883 PetscCall(PetscFree(a->diag)); 884 PetscCall(PetscFree(a->rlen)); 885 PetscCall(PetscFree(a->sliidx)); 886 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 887 PetscCall(PetscFree(a->solve_work)); 888 PetscCall(ISDestroy(&a->icol)); 889 PetscCall(PetscFree(a->saved_values)); 890 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 891 PetscCall(PetscFree(A->data)); 892 893 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 894 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 895 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL)); 900 #if defined(PETSC_HAVE_CUDA) 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL)); 902 #endif 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 905 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 907 PetscFunctionReturn(PETSC_SUCCESS); 908 } 909 910 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 911 { 912 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 913 914 PetscFunctionBegin; 915 switch (op) { 916 case MAT_ROW_ORIENTED: 917 a->roworiented = flg; 918 break; 919 case MAT_KEEP_NONZERO_PATTERN: 920 a->keepnonzeropattern = flg; 921 break; 922 case MAT_NEW_NONZERO_LOCATIONS: 923 a->nonew = (flg ? 0 : 1); 924 break; 925 case MAT_NEW_NONZERO_LOCATION_ERR: 926 a->nonew = (flg ? -1 : 0); 927 break; 928 case MAT_NEW_NONZERO_ALLOCATION_ERR: 929 a->nonew = (flg ? -2 : 0); 930 break; 931 case MAT_UNUSED_NONZERO_LOCATION_ERR: 932 a->nounused = (flg ? -1 : 0); 933 break; 934 case MAT_FORCE_DIAGONAL_ENTRIES: 935 case MAT_IGNORE_OFF_PROC_ENTRIES: 936 case MAT_USE_HASH_TABLE: 937 case MAT_SORTED_FULL: 938 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 939 break; 940 case MAT_SPD: 941 case MAT_SYMMETRIC: 942 case MAT_STRUCTURALLY_SYMMETRIC: 943 case MAT_HERMITIAN: 944 case MAT_SYMMETRY_ETERNAL: 945 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 946 case MAT_SPD_ETERNAL: 947 /* These options are handled directly by MatSetOption() */ 948 break; 949 default: 950 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 951 } 952 PetscFunctionReturn(PETSC_SUCCESS); 953 } 954 955 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 956 { 957 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 958 PetscInt i, j, n, shift; 959 PetscScalar *x, zero = 0.0; 960 961 PetscFunctionBegin; 962 PetscCall(VecGetLocalSize(v, &n)); 963 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 964 965 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 966 PetscInt *diag = a->diag; 967 PetscCall(VecGetArray(v, &x)); 968 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 969 PetscCall(VecRestoreArray(v, &x)); 970 PetscFunctionReturn(PETSC_SUCCESS); 971 } 972 973 PetscCall(VecSet(v, zero)); 974 PetscCall(VecGetArray(v, &x)); 975 for (i = 0; i < n; i++) { /* loop over rows */ 976 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 977 x[i] = 0; 978 for (j = 0; j < a->rlen[i]; j++) { 979 if (a->colidx[shift + a->sliceheight * j] == i) { 980 x[i] = a->val[shift + a->sliceheight * j]; 981 break; 982 } 983 } 984 } 985 PetscCall(VecRestoreArray(v, &x)); 986 PetscFunctionReturn(PETSC_SUCCESS); 987 } 988 989 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 990 { 991 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 992 const PetscScalar *l, *r; 993 PetscInt i, j, m, n, row; 994 995 PetscFunctionBegin; 996 if (ll) { 997 /* The local size is used so that VecMPI can be passed to this routine 998 by MatDiagonalScale_MPISELL */ 999 PetscCall(VecGetLocalSize(ll, &m)); 1000 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1001 PetscCall(VecGetArrayRead(ll, &l)); 1002 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1003 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1004 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1005 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1006 } 1007 } else { 1008 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1009 } 1010 } 1011 PetscCall(VecRestoreArrayRead(ll, &l)); 1012 PetscCall(PetscLogFlops(a->nz)); 1013 } 1014 if (rr) { 1015 PetscCall(VecGetLocalSize(rr, &n)); 1016 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1017 PetscCall(VecGetArrayRead(rr, &r)); 1018 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1019 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1020 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1021 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1022 } 1023 } else { 1024 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1025 } 1026 } 1027 PetscCall(VecRestoreArrayRead(rr, &r)); 1028 PetscCall(PetscLogFlops(a->nz)); 1029 } 1030 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1031 #if defined(PETSC_HAVE_CUDA) 1032 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1033 #endif 1034 PetscFunctionReturn(PETSC_SUCCESS); 1035 } 1036 1037 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1038 { 1039 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1040 PetscInt *cp, i, k, low, high, t, row, col, l; 1041 PetscInt shift; 1042 MatScalar *vp; 1043 1044 PetscFunctionBegin; 1045 for (k = 0; k < m; k++) { /* loop over requested rows */ 1046 row = im[k]; 1047 if (row < 0) continue; 1048 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1049 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1050 cp = a->colidx + shift; /* pointer to the row */ 1051 vp = a->val + shift; /* pointer to the row */ 1052 for (l = 0; l < n; l++) { /* loop over requested columns */ 1053 col = in[l]; 1054 if (col < 0) continue; 1055 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1056 high = a->rlen[row]; 1057 low = 0; /* assume unsorted */ 1058 while (high - low > 5) { 1059 t = (low + high) / 2; 1060 if (*(cp + a->sliceheight * t) > col) high = t; 1061 else low = t; 1062 } 1063 for (i = low; i < high; i++) { 1064 if (*(cp + a->sliceheight * i) > col) break; 1065 if (*(cp + a->sliceheight * i) == col) { 1066 *v++ = *(vp + a->sliceheight * i); 1067 goto finished; 1068 } 1069 } 1070 *v++ = 0.0; 1071 finished:; 1072 } 1073 } 1074 PetscFunctionReturn(PETSC_SUCCESS); 1075 } 1076 1077 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1078 { 1079 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1080 PetscInt i, j, m = A->rmap->n, shift; 1081 const char *name; 1082 PetscViewerFormat format; 1083 1084 PetscFunctionBegin; 1085 PetscCall(PetscViewerGetFormat(viewer, &format)); 1086 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1087 PetscInt nofinalvalue = 0; 1088 /* 1089 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1090 nofinalvalue = 1; 1091 } 1092 */ 1093 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1094 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1095 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1096 #if defined(PETSC_USE_COMPLEX) 1097 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1098 #else 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1100 #endif 1101 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1102 1103 for (i = 0; i < m; i++) { 1104 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1105 for (j = 0; j < a->rlen[i]; j++) { 1106 #if defined(PETSC_USE_COMPLEX) 1107 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1108 #else 1109 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1110 #endif 1111 } 1112 } 1113 /* 1114 if (nofinalvalue) { 1115 #if defined(PETSC_USE_COMPLEX) 1116 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1117 #else 1118 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1119 #endif 1120 } 1121 */ 1122 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1123 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1124 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1125 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1126 PetscFunctionReturn(PETSC_SUCCESS); 1127 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1128 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1129 for (i = 0; i < m; i++) { 1130 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1131 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1132 for (j = 0; j < a->rlen[i]; j++) { 1133 #if defined(PETSC_USE_COMPLEX) 1134 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1135 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1136 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1137 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1138 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1139 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1140 } 1141 #else 1142 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1143 #endif 1144 } 1145 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1146 } 1147 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1148 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1149 PetscInt cnt = 0, jcnt; 1150 PetscScalar value; 1151 #if defined(PETSC_USE_COMPLEX) 1152 PetscBool realonly = PETSC_TRUE; 1153 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1154 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1155 realonly = PETSC_FALSE; 1156 break; 1157 } 1158 } 1159 #endif 1160 1161 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1162 for (i = 0; i < m; i++) { 1163 jcnt = 0; 1164 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1165 for (j = 0; j < A->cmap->n; j++) { 1166 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1167 value = a->val[cnt++]; 1168 jcnt++; 1169 } else { 1170 value = 0.0; 1171 } 1172 #if defined(PETSC_USE_COMPLEX) 1173 if (realonly) { 1174 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1175 } else { 1176 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1177 } 1178 #else 1179 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1180 #endif 1181 } 1182 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1183 } 1184 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1185 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1186 PetscInt fshift = 1; 1187 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1188 #if defined(PETSC_USE_COMPLEX) 1189 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1190 #else 1191 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1192 #endif 1193 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1194 for (i = 0; i < m; i++) { 1195 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1196 for (j = 0; j < a->rlen[i]; j++) { 1197 #if defined(PETSC_USE_COMPLEX) 1198 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1199 #else 1200 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1201 #endif 1202 } 1203 } 1204 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1205 } else if (format == PETSC_VIEWER_NATIVE) { 1206 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1207 PetscInt row; 1208 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1209 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1210 #if defined(PETSC_USE_COMPLEX) 1211 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1212 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1213 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1214 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1215 } else { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1217 } 1218 #else 1219 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1220 #endif 1221 } 1222 } 1223 } else { 1224 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1225 if (A->factortype) { 1226 for (i = 0; i < m; i++) { 1227 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1228 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1229 /* L part */ 1230 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1231 #if defined(PETSC_USE_COMPLEX) 1232 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1233 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1234 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1235 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1236 } else { 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1238 } 1239 #else 1240 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1241 #endif 1242 } 1243 /* diagonal */ 1244 j = a->diag[i]; 1245 #if defined(PETSC_USE_COMPLEX) 1246 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1247 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1248 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1250 } else { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1252 } 1253 #else 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1255 #endif 1256 1257 /* U part */ 1258 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1259 #if defined(PETSC_USE_COMPLEX) 1260 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1261 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1262 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1263 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1264 } else { 1265 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1266 } 1267 #else 1268 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1269 #endif 1270 } 1271 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1272 } 1273 } else { 1274 for (i = 0; i < m; i++) { 1275 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1276 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1277 for (j = 0; j < a->rlen[i]; j++) { 1278 #if defined(PETSC_USE_COMPLEX) 1279 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1280 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1281 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1282 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1283 } else { 1284 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1285 } 1286 #else 1287 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1288 #endif 1289 } 1290 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1291 } 1292 } 1293 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1294 } 1295 PetscCall(PetscViewerFlush(viewer)); 1296 PetscFunctionReturn(PETSC_SUCCESS); 1297 } 1298 1299 #include <petscdraw.h> 1300 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1301 { 1302 Mat A = (Mat)Aa; 1303 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1304 PetscInt i, j, m = A->rmap->n, shift; 1305 int color; 1306 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1307 PetscViewer viewer; 1308 PetscViewerFormat format; 1309 1310 PetscFunctionBegin; 1311 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1312 PetscCall(PetscViewerGetFormat(viewer, &format)); 1313 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1314 1315 /* loop over matrix elements drawing boxes */ 1316 1317 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1318 PetscDrawCollectiveBegin(draw); 1319 /* Blue for negative, Cyan for zero and Red for positive */ 1320 color = PETSC_DRAW_BLUE; 1321 for (i = 0; i < m; i++) { 1322 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1323 y_l = m - i - 1.0; 1324 y_r = y_l + 1.0; 1325 for (j = 0; j < a->rlen[i]; j++) { 1326 x_l = a->colidx[shift + a->sliceheight * j]; 1327 x_r = x_l + 1.0; 1328 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1329 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1330 } 1331 } 1332 color = PETSC_DRAW_CYAN; 1333 for (i = 0; i < m; i++) { 1334 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1335 y_l = m - i - 1.0; 1336 y_r = y_l + 1.0; 1337 for (j = 0; j < a->rlen[i]; j++) { 1338 x_l = a->colidx[shift + a->sliceheight * j]; 1339 x_r = x_l + 1.0; 1340 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1341 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1342 } 1343 } 1344 color = PETSC_DRAW_RED; 1345 for (i = 0; i < m; i++) { 1346 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1347 y_l = m - i - 1.0; 1348 y_r = y_l + 1.0; 1349 for (j = 0; j < a->rlen[i]; j++) { 1350 x_l = a->colidx[shift + a->sliceheight * j]; 1351 x_r = x_l + 1.0; 1352 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1353 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1354 } 1355 } 1356 PetscDrawCollectiveEnd(draw); 1357 } else { 1358 /* use contour shading to indicate magnitude of values */ 1359 /* first determine max of all nonzero values */ 1360 PetscReal minv = 0.0, maxv = 0.0; 1361 PetscInt count = 0; 1362 PetscDraw popup; 1363 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1364 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1365 } 1366 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1367 PetscCall(PetscDrawGetPopup(draw, &popup)); 1368 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1369 1370 PetscDrawCollectiveBegin(draw); 1371 for (i = 0; i < m; i++) { 1372 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1373 y_l = m - i - 1.0; 1374 y_r = y_l + 1.0; 1375 for (j = 0; j < a->rlen[i]; j++) { 1376 x_l = a->colidx[shift + a->sliceheight * j]; 1377 x_r = x_l + 1.0; 1378 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1379 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1380 count++; 1381 } 1382 } 1383 PetscDrawCollectiveEnd(draw); 1384 } 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 #include <petscdraw.h> 1389 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1390 { 1391 PetscDraw draw; 1392 PetscReal xr, yr, xl, yl, h, w; 1393 PetscBool isnull; 1394 1395 PetscFunctionBegin; 1396 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1397 PetscCall(PetscDrawIsNull(draw, &isnull)); 1398 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1399 1400 xr = A->cmap->n; 1401 yr = A->rmap->n; 1402 h = yr / 10.0; 1403 w = xr / 10.0; 1404 xr += w; 1405 yr += h; 1406 xl = -w; 1407 yl = -h; 1408 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1409 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1410 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1411 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1412 PetscCall(PetscDrawSave(draw)); 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1417 { 1418 PetscBool iascii, isbinary, isdraw; 1419 1420 PetscFunctionBegin; 1421 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1422 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1423 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1424 if (iascii) { 1425 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1426 } else if (isbinary) { 1427 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1428 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431 1432 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1433 { 1434 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1435 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1436 MatScalar *vp; 1437 1438 PetscFunctionBegin; 1439 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1440 /* To do: compress out the unused elements */ 1441 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1442 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1443 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1444 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1445 a->nonzerorowcnt = 0; 1446 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1447 for (i = 0; i < a->totalslices; ++i) { 1448 shift = a->sliidx[i]; /* starting index of the slice */ 1449 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1450 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1451 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1452 row = a->sliceheight * i + row_in_slice; 1453 nrow = a->rlen[row]; /* number of nonzeros in row */ 1454 /* 1455 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1456 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1457 */ 1458 lastcol = 0; 1459 if (nrow > 0) { /* nonempty row */ 1460 a->nonzerorowcnt++; 1461 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1462 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1463 for (j = 1; j < a->sliceheight; j++) { 1464 if (a->rlen[a->sliceheight * i + j]) { 1465 lastcol = cp[j]; 1466 break; 1467 } 1468 } 1469 } else { 1470 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1471 } 1472 1473 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1474 cp[a->sliceheight * k + row_in_slice] = lastcol; 1475 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1476 } 1477 } 1478 } 1479 1480 A->info.mallocs += a->reallocs; 1481 a->reallocs = 0; 1482 1483 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1484 PetscFunctionReturn(PETSC_SUCCESS); 1485 } 1486 1487 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1488 { 1489 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1490 1491 PetscFunctionBegin; 1492 info->block_size = 1.0; 1493 info->nz_allocated = a->maxallocmat; 1494 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1495 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1496 info->assemblies = A->num_ass; 1497 info->mallocs = A->info.mallocs; 1498 info->memory = 0; /* REVIEW ME */ 1499 if (A->factortype) { 1500 info->fill_ratio_given = A->info.fill_ratio_given; 1501 info->fill_ratio_needed = A->info.fill_ratio_needed; 1502 info->factor_mallocs = A->info.factor_mallocs; 1503 } else { 1504 info->fill_ratio_given = 0; 1505 info->fill_ratio_needed = 0; 1506 info->factor_mallocs = 0; 1507 } 1508 PetscFunctionReturn(PETSC_SUCCESS); 1509 } 1510 1511 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1512 { 1513 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1514 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1515 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1516 MatScalar *vp, value; 1517 #if defined(PETSC_HAVE_CUDA) 1518 PetscBool inserted = PETSC_FALSE; 1519 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1520 #endif 1521 1522 PetscFunctionBegin; 1523 for (k = 0; k < m; k++) { /* loop over added rows */ 1524 row = im[k]; 1525 if (row < 0) continue; 1526 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1527 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1528 cp = a->colidx + shift; /* pointer to the row */ 1529 vp = a->val + shift; /* pointer to the row */ 1530 nrow = a->rlen[row]; 1531 low = 0; 1532 high = nrow; 1533 1534 for (l = 0; l < n; l++) { /* loop over added columns */ 1535 col = in[l]; 1536 if (col < 0) continue; 1537 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1538 if (a->roworiented) { 1539 value = v[l + k * n]; 1540 } else { 1541 value = v[k + l * m]; 1542 } 1543 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1544 1545 /* search in this row for the specified column, i indicates the column to be set */ 1546 if (col <= lastcol) low = 0; 1547 else high = nrow; 1548 lastcol = col; 1549 while (high - low > 5) { 1550 t = (low + high) / 2; 1551 if (*(cp + a->sliceheight * t) > col) high = t; 1552 else low = t; 1553 } 1554 for (i = low; i < high; i++) { 1555 if (*(cp + a->sliceheight * i) > col) break; 1556 if (*(cp + a->sliceheight * i) == col) { 1557 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1558 else *(vp + a->sliceheight * i) = value; 1559 #if defined(PETSC_HAVE_CUDA) 1560 inserted = PETSC_TRUE; 1561 #endif 1562 low = i + 1; 1563 goto noinsert; 1564 } 1565 } 1566 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1567 if (nonew == 1) goto noinsert; 1568 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1569 #if defined(PETSC_HAVE_CUDA) 1570 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1571 #else 1572 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1573 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1574 #endif 1575 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1576 for (ii = nrow - 1; ii >= i; ii--) { 1577 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1578 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1579 } 1580 a->rlen[row]++; 1581 *(cp + a->sliceheight * i) = col; 1582 *(vp + a->sliceheight * i) = value; 1583 a->nz++; 1584 A->nonzerostate++; 1585 #if defined(PETSC_HAVE_CUDA) 1586 inserted = PETSC_TRUE; 1587 #endif 1588 low = i + 1; 1589 high++; 1590 nrow++; 1591 noinsert:; 1592 } 1593 a->rlen[row] = nrow; 1594 } 1595 #if defined(PETSC_HAVE_CUDA) 1596 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1597 #endif 1598 PetscFunctionReturn(PETSC_SUCCESS); 1599 } 1600 1601 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1602 { 1603 PetscFunctionBegin; 1604 /* If the two matrices have the same copy implementation, use fast copy. */ 1605 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1606 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1607 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1608 1609 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1610 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1611 } else { 1612 PetscCall(MatCopy_Basic(A, B, str)); 1613 } 1614 PetscFunctionReturn(PETSC_SUCCESS); 1615 } 1616 1617 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1618 { 1619 PetscFunctionBegin; 1620 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1625 { 1626 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1627 1628 PetscFunctionBegin; 1629 *array = a->val; 1630 PetscFunctionReturn(PETSC_SUCCESS); 1631 } 1632 1633 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1634 { 1635 PetscFunctionBegin; 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1640 { 1641 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1642 PetscInt i; 1643 MatScalar *aval = a->val; 1644 1645 PetscFunctionBegin; 1646 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1647 #if defined(PETSC_HAVE_CUDA) 1648 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1649 #endif 1650 PetscFunctionReturn(PETSC_SUCCESS); 1651 } 1652 1653 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1654 { 1655 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1656 PetscInt i; 1657 MatScalar *aval = a->val; 1658 1659 PetscFunctionBegin; 1660 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1661 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1662 #if defined(PETSC_HAVE_CUDA) 1663 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1664 #endif 1665 PetscFunctionReturn(PETSC_SUCCESS); 1666 } 1667 1668 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1669 { 1670 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1671 MatScalar *aval = a->val; 1672 PetscScalar oalpha = alpha; 1673 PetscBLASInt one = 1, size; 1674 1675 PetscFunctionBegin; 1676 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1677 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1678 PetscCall(PetscLogFlops(a->nz)); 1679 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1680 #if defined(PETSC_HAVE_CUDA) 1681 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1682 #endif 1683 PetscFunctionReturn(PETSC_SUCCESS); 1684 } 1685 1686 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1687 { 1688 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1689 1690 PetscFunctionBegin; 1691 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1692 PetscCall(MatShift_Basic(Y, a)); 1693 PetscFunctionReturn(PETSC_SUCCESS); 1694 } 1695 1696 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1697 { 1698 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1699 PetscScalar *x, sum, *t; 1700 const MatScalar *idiag = NULL, *mdiag; 1701 const PetscScalar *b, *xb; 1702 PetscInt n, m = A->rmap->n, i, j, shift; 1703 const PetscInt *diag; 1704 1705 PetscFunctionBegin; 1706 its = its * lits; 1707 1708 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1709 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1710 a->fshift = fshift; 1711 a->omega = omega; 1712 1713 diag = a->diag; 1714 t = a->ssor_work; 1715 idiag = a->idiag; 1716 mdiag = a->mdiag; 1717 1718 PetscCall(VecGetArray(xx, &x)); 1719 PetscCall(VecGetArrayRead(bb, &b)); 1720 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1721 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1722 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1723 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1724 1725 if (flag & SOR_ZERO_INITIAL_GUESS) { 1726 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1727 for (i = 0; i < m; i++) { 1728 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1729 sum = b[i]; 1730 n = (diag[i] - shift) / a->sliceheight; 1731 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1732 t[i] = sum; 1733 x[i] = sum * idiag[i]; 1734 } 1735 xb = t; 1736 PetscCall(PetscLogFlops(a->nz)); 1737 } else xb = b; 1738 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1739 for (i = m - 1; i >= 0; i--) { 1740 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1741 sum = xb[i]; 1742 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1743 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1744 if (xb == b) { 1745 x[i] = sum * idiag[i]; 1746 } else { 1747 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1748 } 1749 } 1750 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1751 } 1752 its--; 1753 } 1754 while (its--) { 1755 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1756 for (i = 0; i < m; i++) { 1757 /* lower */ 1758 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1759 sum = b[i]; 1760 n = (diag[i] - shift) / a->sliceheight; 1761 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1762 t[i] = sum; /* save application of the lower-triangular part */ 1763 /* upper */ 1764 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1765 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1766 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1767 } 1768 xb = t; 1769 PetscCall(PetscLogFlops(2.0 * a->nz)); 1770 } else xb = b; 1771 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1772 for (i = m - 1; i >= 0; i--) { 1773 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1774 sum = xb[i]; 1775 if (xb == b) { 1776 /* whole matrix (no checkpointing available) */ 1777 n = a->rlen[i]; 1778 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1779 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1780 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1781 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1782 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1783 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1784 } 1785 } 1786 if (xb == b) { 1787 PetscCall(PetscLogFlops(2.0 * a->nz)); 1788 } else { 1789 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1790 } 1791 } 1792 } 1793 PetscCall(VecRestoreArray(xx, &x)); 1794 PetscCall(VecRestoreArrayRead(bb, &b)); 1795 PetscFunctionReturn(PETSC_SUCCESS); 1796 } 1797 1798 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1799 MatGetRow_SeqSELL, 1800 MatRestoreRow_SeqSELL, 1801 MatMult_SeqSELL, 1802 /* 4*/ MatMultAdd_SeqSELL, 1803 MatMultTranspose_SeqSELL, 1804 MatMultTransposeAdd_SeqSELL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 /* 10*/ NULL, 1809 NULL, 1810 NULL, 1811 MatSOR_SeqSELL, 1812 NULL, 1813 /* 15*/ MatGetInfo_SeqSELL, 1814 MatEqual_SeqSELL, 1815 MatGetDiagonal_SeqSELL, 1816 MatDiagonalScale_SeqSELL, 1817 NULL, 1818 /* 20*/ NULL, 1819 MatAssemblyEnd_SeqSELL, 1820 MatSetOption_SeqSELL, 1821 MatZeroEntries_SeqSELL, 1822 /* 24*/ NULL, 1823 NULL, 1824 NULL, 1825 NULL, 1826 NULL, 1827 /* 29*/ MatSetUp_SeqSELL, 1828 NULL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 /* 34*/ MatDuplicate_SeqSELL, 1833 NULL, 1834 NULL, 1835 NULL, 1836 NULL, 1837 /* 39*/ NULL, 1838 NULL, 1839 NULL, 1840 MatGetValues_SeqSELL, 1841 MatCopy_SeqSELL, 1842 /* 44*/ NULL, 1843 MatScale_SeqSELL, 1844 MatShift_SeqSELL, 1845 NULL, 1846 NULL, 1847 /* 49*/ NULL, 1848 NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1853 NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 /* 59*/ NULL, 1858 MatDestroy_SeqSELL, 1859 MatView_SeqSELL, 1860 NULL, 1861 NULL, 1862 /* 64*/ NULL, 1863 NULL, 1864 NULL, 1865 NULL, 1866 NULL, 1867 /* 69*/ NULL, 1868 NULL, 1869 NULL, 1870 NULL, 1871 NULL, 1872 /* 74*/ NULL, 1873 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1874 NULL, 1875 NULL, 1876 NULL, 1877 /* 79*/ NULL, 1878 NULL, 1879 NULL, 1880 NULL, 1881 NULL, 1882 /* 84*/ NULL, 1883 NULL, 1884 NULL, 1885 NULL, 1886 NULL, 1887 /* 89*/ NULL, 1888 NULL, 1889 NULL, 1890 NULL, 1891 NULL, 1892 /* 94*/ NULL, 1893 NULL, 1894 NULL, 1895 NULL, 1896 NULL, 1897 /* 99*/ NULL, 1898 NULL, 1899 NULL, 1900 MatConjugate_SeqSELL, 1901 NULL, 1902 /*104*/ NULL, 1903 NULL, 1904 NULL, 1905 NULL, 1906 NULL, 1907 /*109*/ NULL, 1908 NULL, 1909 NULL, 1910 NULL, 1911 MatMissingDiagonal_SeqSELL, 1912 /*114*/ NULL, 1913 NULL, 1914 NULL, 1915 NULL, 1916 NULL, 1917 /*119*/ NULL, 1918 NULL, 1919 NULL, 1920 NULL, 1921 NULL, 1922 /*124*/ NULL, 1923 NULL, 1924 NULL, 1925 NULL, 1926 NULL, 1927 /*129*/ NULL, 1928 NULL, 1929 NULL, 1930 NULL, 1931 NULL, 1932 /*134*/ NULL, 1933 NULL, 1934 NULL, 1935 NULL, 1936 NULL, 1937 /*139*/ NULL, 1938 NULL, 1939 NULL, 1940 MatFDColoringSetUp_SeqXAIJ, 1941 NULL, 1942 /*144*/ NULL, 1943 NULL, 1944 NULL, 1945 NULL, 1946 NULL, 1947 NULL, 1948 /*150*/ NULL, 1949 NULL}; 1950 1951 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1952 { 1953 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1954 1955 PetscFunctionBegin; 1956 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1957 1958 /* allocate space for values if not already there */ 1959 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1960 1961 /* copy values over */ 1962 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1963 PetscFunctionReturn(PETSC_SUCCESS); 1964 } 1965 1966 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1967 { 1968 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1969 1970 PetscFunctionBegin; 1971 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1972 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1973 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1974 PetscFunctionReturn(PETSC_SUCCESS); 1975 } 1976 1977 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1978 { 1979 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1980 1981 PetscFunctionBegin; 1982 if (a->totalslices && a->sliidx[a->totalslices]) { 1983 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1984 } else { 1985 *ratio = 0.0; 1986 } 1987 PetscFunctionReturn(PETSC_SUCCESS); 1988 } 1989 1990 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1991 { 1992 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1993 PetscInt i, current_slicewidth; 1994 1995 PetscFunctionBegin; 1996 *slicewidth = 0; 1997 for (i = 0; i < a->totalslices; i++) { 1998 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1999 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 2000 } 2001 PetscFunctionReturn(PETSC_SUCCESS); 2002 } 2003 2004 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 2005 { 2006 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2007 2008 PetscFunctionBegin; 2009 *slicewidth = 0; 2010 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2011 PetscFunctionReturn(PETSC_SUCCESS); 2012 } 2013 2014 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2015 { 2016 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2017 2018 PetscFunctionBegin; 2019 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2020 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2021 a->sliceheight = sliceheight; 2022 #if defined(PETSC_HAVE_CUDA) 2023 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2024 #endif 2025 PetscFunctionReturn(PETSC_SUCCESS); 2026 } 2027 2028 /*@C 2029 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 2030 2031 Not Collective 2032 2033 Input Parameters: 2034 + A - a `MATSEQSELL` matrix 2035 - array - pointer to the data 2036 2037 Level: intermediate 2038 2039 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 2040 @*/ 2041 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 2042 { 2043 PetscFunctionBegin; 2044 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 2045 PetscFunctionReturn(PETSC_SUCCESS); 2046 } 2047 2048 /*@C 2049 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2050 2051 Not Collective 2052 2053 Input Parameter: 2054 . A - a MATSEQSELL matrix 2055 2056 Output Parameter: 2057 . ratio - ratio of number of padded zeros to number of allocated elements 2058 2059 Level: intermediate 2060 @*/ 2061 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2062 { 2063 PetscFunctionBegin; 2064 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2065 PetscFunctionReturn(PETSC_SUCCESS); 2066 } 2067 2068 /*@C 2069 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2070 2071 Not Collective 2072 2073 Input Parameter: 2074 . A - a MATSEQSELL matrix 2075 2076 Output Parameter: 2077 . slicewidth - maximum slice width 2078 2079 Level: intermediate 2080 @*/ 2081 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2082 { 2083 PetscFunctionBegin; 2084 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2085 PetscFunctionReturn(PETSC_SUCCESS); 2086 } 2087 2088 /*@C 2089 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2090 2091 Not Collective 2092 2093 Input Parameter: 2094 . A - a MATSEQSELL matrix 2095 2096 Output Parameter: 2097 . slicewidth - average slice width 2098 2099 Level: intermediate 2100 @*/ 2101 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2102 { 2103 PetscFunctionBegin; 2104 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2105 PetscFunctionReturn(PETSC_SUCCESS); 2106 } 2107 2108 /*@C 2109 MatSeqSELLSetSliceHeight - sets the slice height. 2110 2111 Not Collective 2112 2113 Input Parameters: 2114 + A - a MATSEQSELL matrix 2115 - sliceheight - slice height 2116 2117 Notes: 2118 You cannot change the slice height once it have been set. 2119 2120 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2121 2122 Level: intermediate 2123 @*/ 2124 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2125 { 2126 PetscFunctionBegin; 2127 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2128 PetscFunctionReturn(PETSC_SUCCESS); 2129 } 2130 2131 /*@C 2132 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2133 2134 Not Collective 2135 2136 Input Parameter: 2137 . A - a MATSEQSELL matrix 2138 2139 Output Parameter: 2140 . variance - variance of the slice size 2141 2142 Level: intermediate 2143 @*/ 2144 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2145 { 2146 PetscFunctionBegin; 2147 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2148 PetscFunctionReturn(PETSC_SUCCESS); 2149 } 2150 2151 #if defined(PETSC_HAVE_CUDA) 2152 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2153 #endif 2154 2155 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2156 { 2157 Mat_SeqSELL *b; 2158 PetscMPIInt size; 2159 2160 PetscFunctionBegin; 2161 PetscCall(PetscCitationsRegister(citation, &cited)); 2162 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2163 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2164 2165 PetscCall(PetscNew(&b)); 2166 2167 B->data = (void *)b; 2168 2169 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 2170 2171 b->row = NULL; 2172 b->col = NULL; 2173 b->icol = NULL; 2174 b->reallocs = 0; 2175 b->ignorezeroentries = PETSC_FALSE; 2176 b->roworiented = PETSC_TRUE; 2177 b->nonew = 0; 2178 b->diag = NULL; 2179 b->solve_work = NULL; 2180 B->spptr = NULL; 2181 b->saved_values = NULL; 2182 b->idiag = NULL; 2183 b->mdiag = NULL; 2184 b->ssor_work = NULL; 2185 b->omega = 1.0; 2186 b->fshift = 0.0; 2187 b->idiagvalid = PETSC_FALSE; 2188 b->keepnonzeropattern = PETSC_FALSE; 2189 b->sliceheight = 0; 2190 2191 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2193 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2194 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2195 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2196 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2197 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ)); 2198 #if defined(PETSC_HAVE_CUDA) 2199 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2200 #endif 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2202 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2203 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2204 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2205 2206 PetscObjectOptionsBegin((PetscObject)B); 2207 { 2208 PetscInt newsh = -1; 2209 PetscBool flg; 2210 2211 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2212 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2213 } 2214 PetscOptionsEnd(); 2215 PetscFunctionReturn(PETSC_SUCCESS); 2216 } 2217 2218 /* 2219 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2220 */ 2221 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2222 { 2223 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2224 PetscInt i, m = A->rmap->n; 2225 PetscInt totalslices = a->totalslices; 2226 2227 PetscFunctionBegin; 2228 C->factortype = A->factortype; 2229 c->row = NULL; 2230 c->col = NULL; 2231 c->icol = NULL; 2232 c->reallocs = 0; 2233 C->assembled = PETSC_TRUE; 2234 2235 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2236 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2237 2238 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2239 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2240 2241 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2242 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2243 2244 /* allocate the matrix space */ 2245 if (mallocmatspace) { 2246 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2247 2248 c->singlemalloc = PETSC_TRUE; 2249 2250 if (m > 0) { 2251 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2252 if (cpvalues == MAT_COPY_VALUES) { 2253 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2254 } else { 2255 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2256 } 2257 } 2258 } 2259 2260 c->ignorezeroentries = a->ignorezeroentries; 2261 c->roworiented = a->roworiented; 2262 c->nonew = a->nonew; 2263 if (a->diag) { 2264 PetscCall(PetscMalloc1(m, &c->diag)); 2265 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2266 } else c->diag = NULL; 2267 2268 c->solve_work = NULL; 2269 c->saved_values = NULL; 2270 c->idiag = NULL; 2271 c->ssor_work = NULL; 2272 c->keepnonzeropattern = a->keepnonzeropattern; 2273 c->free_val = PETSC_TRUE; 2274 c->free_colidx = PETSC_TRUE; 2275 2276 c->maxallocmat = a->maxallocmat; 2277 c->maxallocrow = a->maxallocrow; 2278 c->rlenmax = a->rlenmax; 2279 c->nz = a->nz; 2280 C->preallocated = PETSC_TRUE; 2281 2282 c->nonzerorowcnt = a->nonzerorowcnt; 2283 C->nonzerostate = A->nonzerostate; 2284 2285 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2286 PetscFunctionReturn(PETSC_SUCCESS); 2287 } 2288 2289 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2290 { 2291 PetscFunctionBegin; 2292 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2293 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2294 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2295 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2296 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2297 PetscFunctionReturn(PETSC_SUCCESS); 2298 } 2299 2300 /*MC 2301 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2302 based on the sliced Ellpack format 2303 2304 Options Database Key: 2305 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2306 2307 Level: beginner 2308 2309 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2310 M*/ 2311 2312 /*MC 2313 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2314 2315 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2316 and `MATMPISELL` otherwise. As a result, for single process communicators, 2317 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2318 for communicators controlling multiple processes. It is recommended that you call both of 2319 the above preallocation routines for simplicity. 2320 2321 Options Database Key: 2322 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2323 2324 Level: beginner 2325 2326 Notes: 2327 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2328 2329 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2330 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2331 2332 Developer Notes: 2333 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2334 2335 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2336 .vb 2337 (2 0 3 4) 2338 Consider the matrix A = (5 0 6 0) 2339 (0 0 7 8) 2340 (0 0 9 9) 2341 2342 symbolically the Ellpack format can be written as 2343 2344 (2 3 4 |) (0 2 3 |) 2345 v = (5 6 0 |) colidx = (0 2 2 |) 2346 -------- --------- 2347 (7 8 |) (2 3 |) 2348 (9 9 |) (2 3 |) 2349 2350 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2351 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2352 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2353 2354 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2355 2356 .ve 2357 2358 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2359 2360 References: 2361 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2362 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2363 2364 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2365 M*/ 2366 2367 /*@C 2368 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2369 2370 Collective 2371 2372 Input Parameters: 2373 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2374 . m - number of rows 2375 . n - number of columns 2376 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2377 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2378 2379 Output Parameter: 2380 . A - the matrix 2381 2382 Level: intermediate 2383 2384 Notes: 2385 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2386 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2387 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2388 2389 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2390 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2391 allocation. 2392 2393 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2394 @*/ 2395 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2396 { 2397 PetscFunctionBegin; 2398 PetscCall(MatCreate(comm, A)); 2399 PetscCall(MatSetSizes(*A, m, n, m, n)); 2400 PetscCall(MatSetType(*A, MATSEQSELL)); 2401 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2402 PetscFunctionReturn(PETSC_SUCCESS); 2403 } 2404 2405 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2406 { 2407 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2408 PetscInt totalslices = a->totalslices; 2409 2410 PetscFunctionBegin; 2411 /* If the matrix dimensions are not equal,or no of nonzeros */ 2412 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2413 *flg = PETSC_FALSE; 2414 PetscFunctionReturn(PETSC_SUCCESS); 2415 } 2416 /* if the a->colidx are the same */ 2417 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2418 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2419 /* if a->val are the same */ 2420 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2421 PetscFunctionReturn(PETSC_SUCCESS); 2422 } 2423 2424 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2425 { 2426 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2427 2428 PetscFunctionBegin; 2429 a->idiagvalid = PETSC_FALSE; 2430 PetscFunctionReturn(PETSC_SUCCESS); 2431 } 2432 2433 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2434 { 2435 #if defined(PETSC_USE_COMPLEX) 2436 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2437 PetscInt i; 2438 PetscScalar *val = a->val; 2439 2440 PetscFunctionBegin; 2441 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2442 #if defined(PETSC_HAVE_CUDA) 2443 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2444 #endif 2445 #else 2446 PetscFunctionBegin; 2447 #endif 2448 PetscFunctionReturn(PETSC_SUCCESS); 2449 } 2450