1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Note: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 #if defined(PETSC_HAVE_CUDA) 93 PetscInt rlenmax = 0; 94 #endif 95 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 96 97 PetscFunctionBegin; 98 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 99 if (maxallocrow == MAT_SKIP_ALLOCATION) { 100 skipallocation = PETSC_TRUE; 101 maxallocrow = 0; 102 } 103 104 PetscCall(PetscLayoutSetUp(B->rmap)); 105 PetscCall(PetscLayoutSetUp(B->cmap)); 106 107 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 108 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 109 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 110 if (rlen) { 111 for (i = 0; i < B->rmap->n; i++) { 112 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 113 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 114 } 115 } 116 117 B->preallocated = PETSC_TRUE; 118 119 b = (Mat_SeqSELL *)B->data; 120 121 if (!b->sliceheight) { /* not set yet */ 122 #if defined(PETSC_HAVE_CUDA) 123 b->sliceheight = 16; 124 #else 125 b->sliceheight = 8; 126 #endif 127 } 128 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 129 b->totalslices = totalslices; 130 if (!skipallocation) { 131 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 132 133 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 134 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 135 } 136 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 137 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 138 else if (maxallocrow < 0) maxallocrow = 1; 139 #if defined(PETSC_HAVE_CUDA) 140 rlenmax = maxallocrow; 141 /* Pad the slice to DEVICE_MEM_ALIGN */ 142 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 143 #endif 144 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 145 } else { 146 #if defined(PETSC_HAVE_CUDA) 147 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 148 #endif 149 maxallocrow = 0; 150 b->sliidx[0] = 0; 151 for (i = 1; i < totalslices; i++) { 152 b->sliidx[i] = 0; 153 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 154 #if defined(PETSC_HAVE_CUDA) 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 /* Pad the slice to DEVICE_MEM_ALIGN */ 157 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUDA) 166 rlenmax = PetscMax(b->sliidx[i], rlenmax); 167 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 168 #endif 169 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 170 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 171 } 172 173 /* allocate space for val, colidx, rlen */ 174 /* FIXME: should B's old memory be unlogged? */ 175 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 176 /* FIXME: assuming an element of the bit array takes 8 bits */ 177 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 178 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 179 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 180 181 b->singlemalloc = PETSC_TRUE; 182 b->free_val = PETSC_TRUE; 183 b->free_colidx = PETSC_TRUE; 184 } else { 185 b->free_val = PETSC_FALSE; 186 b->free_colidx = PETSC_FALSE; 187 } 188 189 b->nz = 0; 190 b->maxallocrow = maxallocrow; 191 #if defined(PETSC_HAVE_CUDA) 192 b->rlenmax = rlenmax; 193 #else 194 b->rlenmax = maxallocrow; 195 #endif 196 b->maxallocmat = b->sliidx[totalslices]; 197 B->info.nz_unneeded = (double)b->maxallocmat; 198 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 203 { 204 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 205 PetscInt shift; 206 207 PetscFunctionBegin; 208 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 209 if (nz) *nz = a->rlen[row]; 210 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 211 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 212 if (idx) { 213 PetscInt j; 214 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 215 *idx = a->getrowcols; 216 } 217 if (v) { 218 PetscInt j; 219 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 220 *v = a->getrowvals; 221 } 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 226 { 227 PetscFunctionBegin; 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 232 { 233 Mat B; 234 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 235 PetscInt i; 236 237 PetscFunctionBegin; 238 if (reuse == MAT_REUSE_MATRIX) { 239 B = *newmat; 240 PetscCall(MatZeroEntries(B)); 241 } else { 242 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 243 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 244 PetscCall(MatSetType(B, MATSEQAIJ)); 245 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 246 } 247 248 for (i = 0; i < A->rmap->n; i++) { 249 PetscInt nz = 0, *cols = NULL; 250 PetscScalar *vals = NULL; 251 252 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 253 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 254 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 } 256 257 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 258 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 259 B->rmap->bs = A->rmap->bs; 260 261 if (reuse == MAT_INPLACE_MATRIX) { 262 PetscCall(MatHeaderReplace(A, &B)); 263 } else { 264 *newmat = B; 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 #include <../src/mat/impls/aij/seq/aij.h> 270 271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 272 { 273 Mat B; 274 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 275 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 276 const PetscInt *cols; 277 const PetscScalar *vals; 278 279 PetscFunctionBegin; 280 281 if (reuse == MAT_REUSE_MATRIX) { 282 B = *newmat; 283 } else { 284 if (PetscDefined(USE_DEBUG) || !a->ilen) { 285 PetscCall(PetscMalloc1(m, &rowlengths)); 286 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 287 } 288 if (PetscDefined(USE_DEBUG) && a->ilen) { 289 PetscBool eq; 290 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 291 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 292 PetscCall(PetscFree(rowlengths)); 293 rowlengths = a->ilen; 294 } else if (a->ilen) rowlengths = a->ilen; 295 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 296 PetscCall(MatSetSizes(B, m, n, m, n)); 297 PetscCall(MatSetType(B, MATSEQSELL)); 298 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 299 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 300 } 301 302 for (row = 0; row < m; row++) { 303 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 304 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 305 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 306 } 307 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 308 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 309 B->rmap->bs = A->rmap->bs; 310 311 if (reuse == MAT_INPLACE_MATRIX) { 312 PetscCall(MatHeaderReplace(A, &B)); 313 } else { 314 *newmat = B; 315 } 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 320 { 321 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 322 PetscScalar *y; 323 const PetscScalar *x; 324 const MatScalar *aval = a->val; 325 PetscInt totalslices = a->totalslices; 326 const PetscInt *acolidx = a->colidx; 327 PetscInt i, j; 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 __m512d vec_x, vec_y, vec_vals; 330 __m256i vec_idx; 331 __mmask8 mask; 332 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 333 __m256i vec_idx2, vec_idx3, vec_idx4; 334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 335 __m128i vec_idx; 336 __m256d vec_x, vec_y, vec_y2, vec_vals; 337 MatScalar yval; 338 PetscInt r, rows_left, row, nnz_in_row; 339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 340 __m128d vec_x_tmp; 341 __m256d vec_x, vec_y, vec_y2, vec_vals; 342 MatScalar yval; 343 PetscInt r, rows_left, row, nnz_in_row; 344 #else 345 PetscInt k, sliceheight = a->sliceheight; 346 PetscScalar *sum; 347 #endif 348 349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 350 #pragma disjoint(*x, *y, *aval) 351 #endif 352 353 PetscFunctionBegin; 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(yy, &y)); 356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 357 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 358 for (i = 0; i < totalslices; i++) { /* loop over slices */ 359 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 362 vec_y = _mm512_setzero_pd(); 363 vec_y2 = _mm512_setzero_pd(); 364 vec_y3 = _mm512_setzero_pd(); 365 vec_y4 = _mm512_setzero_pd(); 366 367 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 368 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 369 case 3: 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 j += 3; 380 break; 381 case 2: 382 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 383 acolidx += 8; 384 aval += 8; 385 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 386 acolidx += 8; 387 aval += 8; 388 j += 2; 389 break; 390 case 1: 391 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 392 acolidx += 8; 393 aval += 8; 394 j += 1; 395 break; 396 } 397 #pragma novector 398 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 399 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 400 acolidx += 8; 401 aval += 8; 402 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 403 acolidx += 8; 404 aval += 8; 405 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 406 acolidx += 8; 407 aval += 8; 408 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 409 acolidx += 8; 410 aval += 8; 411 } 412 413 vec_y = _mm512_add_pd(vec_y, vec_y2); 414 vec_y = _mm512_add_pd(vec_y, vec_y3); 415 vec_y = _mm512_add_pd(vec_y, vec_y4); 416 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 417 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 418 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 419 } else { 420 _mm512_storeu_pd(&y[8 * i], vec_y); 421 } 422 } 423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 424 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 425 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 426 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 429 /* last slice may have padding rows. Don't use vectorization. */ 430 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 431 rows_left = A->rmap->n - 8 * i; 432 for (r = 0; r < rows_left; ++r) { 433 yval = (MatScalar)0; 434 row = 8 * i + r; 435 nnz_in_row = a->rlen[row]; 436 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 437 y[row] = yval; 438 } 439 break; 440 } 441 442 vec_y = _mm256_setzero_pd(); 443 vec_y2 = _mm256_setzero_pd(); 444 445 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 446 #pragma novector 447 #pragma unroll(2) 448 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 449 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 450 aval += 4; 451 acolidx += 4; 452 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 453 aval += 4; 454 acolidx += 4; 455 } 456 457 _mm256_storeu_pd(y + i * 8, vec_y); 458 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 459 } 460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 461 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 462 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 463 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 466 vec_y = _mm256_setzero_pd(); 467 vec_y2 = _mm256_setzero_pd(); 468 469 /* last slice may have padding rows. Don't use vectorization. */ 470 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 471 rows_left = A->rmap->n - 8 * i; 472 for (r = 0; r < rows_left; ++r) { 473 yval = (MatScalar)0; 474 row = 8 * i + r; 475 nnz_in_row = a->rlen[row]; 476 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 477 y[row] = yval; 478 } 479 break; 480 } 481 482 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 483 #pragma novector 484 #pragma unroll(2) 485 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 486 vec_vals = _mm256_loadu_pd(aval); 487 vec_x_tmp = _mm_setzero_pd(); 488 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 489 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 490 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 491 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 492 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 493 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 494 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 495 aval += 4; 496 497 vec_vals = _mm256_loadu_pd(aval); 498 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 499 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 500 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 501 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 502 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 503 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 504 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 505 aval += 4; 506 } 507 508 _mm256_storeu_pd(y + i * 8, vec_y); 509 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 510 } 511 #else 512 PetscCall(PetscMalloc1(sliceheight, &sum)); 513 for (i = 0; i < totalslices; i++) { /* loop over slices */ 514 for (j = 0; j < sliceheight; j++) { 515 sum[j] = 0.0; 516 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 517 } 518 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 519 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 520 } else { 521 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 522 } 523 } 524 PetscCall(PetscFree(sum)); 525 #endif 526 527 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 528 PetscCall(VecRestoreArrayRead(xx, &x)); 529 PetscCall(VecRestoreArray(yy, &y)); 530 PetscFunctionReturn(PETSC_SUCCESS); 531 } 532 533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 535 { 536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 537 PetscScalar *y, *z; 538 const PetscScalar *x; 539 const MatScalar *aval = a->val; 540 PetscInt totalslices = a->totalslices; 541 const PetscInt *acolidx = a->colidx; 542 PetscInt i, j; 543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 544 __m512d vec_x, vec_y, vec_vals; 545 __m256i vec_idx; 546 __mmask8 mask; 547 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 548 __m256i vec_idx2, vec_idx3, vec_idx4; 549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 550 __m128d vec_x_tmp; 551 __m256d vec_x, vec_y, vec_y2, vec_vals; 552 MatScalar yval; 553 PetscInt r, row, nnz_in_row; 554 #else 555 PetscInt k, sliceheight = a->sliceheight; 556 PetscScalar *sum; 557 #endif 558 559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 560 #pragma disjoint(*x, *y, *aval) 561 #endif 562 563 PetscFunctionBegin; 564 if (!a->nz) { 565 PetscCall(VecCopy(yy, zz)); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 571 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 572 for (i = 0; i < totalslices; i++) { /* loop over slices */ 573 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 576 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 577 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 578 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 579 } else { 580 vec_y = _mm512_loadu_pd(&y[8 * i]); 581 } 582 vec_y2 = _mm512_setzero_pd(); 583 vec_y3 = _mm512_setzero_pd(); 584 vec_y4 = _mm512_setzero_pd(); 585 586 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 587 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 588 case 3: 589 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 596 acolidx += 8; 597 aval += 8; 598 j += 3; 599 break; 600 case 2: 601 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 602 acolidx += 8; 603 aval += 8; 604 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 605 acolidx += 8; 606 aval += 8; 607 j += 2; 608 break; 609 case 1: 610 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 611 acolidx += 8; 612 aval += 8; 613 j += 1; 614 break; 615 } 616 #pragma novector 617 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 618 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 619 acolidx += 8; 620 aval += 8; 621 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 622 acolidx += 8; 623 aval += 8; 624 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 625 acolidx += 8; 626 aval += 8; 627 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 628 acolidx += 8; 629 aval += 8; 630 } 631 632 vec_y = _mm512_add_pd(vec_y, vec_y2); 633 vec_y = _mm512_add_pd(vec_y, vec_y3); 634 vec_y = _mm512_add_pd(vec_y, vec_y4); 635 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 636 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 637 } else { 638 _mm512_storeu_pd(&z[8 * i], vec_y); 639 } 640 } 641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 642 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 643 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 644 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 647 /* last slice may have padding rows. Don't use vectorization. */ 648 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 649 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 650 row = 8 * i + r; 651 yval = (MatScalar)0.0; 652 nnz_in_row = a->rlen[row]; 653 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 654 z[row] = y[row] + yval; 655 } 656 break; 657 } 658 659 vec_y = _mm256_loadu_pd(y + 8 * i); 660 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 661 662 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 663 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 664 vec_vals = _mm256_loadu_pd(aval); 665 vec_x_tmp = _mm_setzero_pd(); 666 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 667 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 668 vec_x = _mm256_setzero_pd(); 669 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 670 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 671 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 672 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 673 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 674 aval += 4; 675 676 vec_vals = _mm256_loadu_pd(aval); 677 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 678 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 679 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 680 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 681 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 682 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 683 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 684 aval += 4; 685 } 686 687 _mm256_storeu_pd(z + i * 8, vec_y); 688 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 689 } 690 #else 691 PetscCall(PetscMalloc1(sliceheight, &sum)); 692 for (i = 0; i < totalslices; i++) { /* loop over slices */ 693 for (j = 0; j < sliceheight; j++) { 694 sum[j] = 0.0; 695 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 696 } 697 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 698 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 699 } else { 700 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 701 } 702 } 703 PetscCall(PetscFree(sum)); 704 #endif 705 706 PetscCall(PetscLogFlops(2.0 * a->nz)); 707 PetscCall(VecRestoreArrayRead(xx, &x)); 708 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 709 PetscFunctionReturn(PETSC_SUCCESS); 710 } 711 712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 713 { 714 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 715 PetscScalar *y; 716 const PetscScalar *x; 717 const MatScalar *aval = a->val; 718 const PetscInt *acolidx = a->colidx; 719 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 720 721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 722 #pragma disjoint(*x, *y, *aval) 723 #endif 724 725 PetscFunctionBegin; 726 if (A->symmetric == PETSC_BOOL3_TRUE) { 727 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 728 PetscFunctionReturn(PETSC_SUCCESS); 729 } 730 if (zz != yy) PetscCall(VecCopy(zz, yy)); 731 732 if (a->nz) { 733 PetscCall(VecGetArrayRead(xx, &x)); 734 PetscCall(VecGetArray(yy, &y)); 735 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 736 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 737 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 738 row = sliceheight * i + r; 739 nnz_in_row = a->rlen[row]; 740 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 741 } 742 break; 743 } 744 for (r = 0; r < sliceheight; ++r) 745 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 746 } 747 PetscCall(PetscLogFlops(2.0 * a->nz)); 748 PetscCall(VecRestoreArrayRead(xx, &x)); 749 PetscCall(VecRestoreArray(yy, &y)); 750 } 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 755 { 756 PetscFunctionBegin; 757 if (A->symmetric == PETSC_BOOL3_TRUE) { 758 PetscCall(MatMult_SeqSELL(A, xx, yy)); 759 } else { 760 PetscCall(VecSet(yy, 0.0)); 761 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 762 } 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 766 /* 767 Checks for missing diagonals 768 */ 769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 770 { 771 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 772 PetscInt *diag, i; 773 774 PetscFunctionBegin; 775 *missing = PETSC_FALSE; 776 if (A->rmap->n > 0 && !(a->colidx)) { 777 *missing = PETSC_TRUE; 778 if (d) *d = 0; 779 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 780 } else { 781 diag = a->diag; 782 for (i = 0; i < A->rmap->n; i++) { 783 if (diag[i] == -1) { 784 *missing = PETSC_TRUE; 785 if (d) *d = i; 786 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 795 { 796 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 797 PetscInt i, j, m = A->rmap->n, shift; 798 799 PetscFunctionBegin; 800 if (!a->diag) { 801 PetscCall(PetscMalloc1(m, &a->diag)); 802 a->free_diag = PETSC_TRUE; 803 } 804 for (i = 0; i < m; i++) { /* loop over rows */ 805 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 806 a->diag[i] = -1; 807 for (j = 0; j < a->rlen[i]; j++) { 808 if (a->colidx[shift + a->sliceheight * j] == i) { 809 a->diag[i] = shift + a->sliceheight * j; 810 break; 811 } 812 } 813 } 814 PetscFunctionReturn(PETSC_SUCCESS); 815 } 816 817 /* 818 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 819 */ 820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 821 { 822 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 823 PetscInt i, *diag, m = A->rmap->n; 824 MatScalar *val = a->val; 825 PetscScalar *idiag, *mdiag; 826 827 PetscFunctionBegin; 828 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 829 PetscCall(MatMarkDiagonal_SeqSELL(A)); 830 diag = a->diag; 831 if (!a->idiag) { 832 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 833 val = a->val; 834 } 835 mdiag = a->mdiag; 836 idiag = a->idiag; 837 838 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 839 for (i = 0; i < m; i++) { 840 mdiag[i] = val[diag[i]]; 841 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 842 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 843 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 844 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 845 A->factorerror_zeropivot_value = 0.0; 846 A->factorerror_zeropivot_row = i; 847 } 848 idiag[i] = 1.0 / val[diag[i]]; 849 } 850 PetscCall(PetscLogFlops(m)); 851 } else { 852 for (i = 0; i < m; i++) { 853 mdiag[i] = val[diag[i]]; 854 idiag[i] = omega / (fshift + val[diag[i]]); 855 } 856 PetscCall(PetscLogFlops(2.0 * m)); 857 } 858 a->idiagvalid = PETSC_TRUE; 859 PetscFunctionReturn(PETSC_SUCCESS); 860 } 861 862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 863 { 864 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 865 866 PetscFunctionBegin; 867 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 868 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 PetscErrorCode MatDestroy_SeqSELL(Mat A) 873 { 874 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 875 876 PetscFunctionBegin; 877 #if defined(PETSC_USE_LOG) 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 #endif 880 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 881 PetscCall(ISDestroy(&a->row)); 882 PetscCall(ISDestroy(&a->col)); 883 PetscCall(PetscFree(a->diag)); 884 PetscCall(PetscFree(a->rlen)); 885 PetscCall(PetscFree(a->sliidx)); 886 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 887 PetscCall(PetscFree(a->solve_work)); 888 PetscCall(ISDestroy(&a->icol)); 889 PetscCall(PetscFree(a->saved_values)); 890 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 891 PetscCall(PetscFree(A->data)); 892 #if defined(PETSC_HAVE_CUDA) 893 PetscCall(PetscFree(a->chunk_slice_map)); 894 #endif 895 896 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 902 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL)); 903 #if defined(PETSC_HAVE_CUDA) 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL)); 905 #endif 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 910 PetscFunctionReturn(PETSC_SUCCESS); 911 } 912 913 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 914 { 915 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 916 917 PetscFunctionBegin; 918 switch (op) { 919 case MAT_ROW_ORIENTED: 920 a->roworiented = flg; 921 break; 922 case MAT_KEEP_NONZERO_PATTERN: 923 a->keepnonzeropattern = flg; 924 break; 925 case MAT_NEW_NONZERO_LOCATIONS: 926 a->nonew = (flg ? 0 : 1); 927 break; 928 case MAT_NEW_NONZERO_LOCATION_ERR: 929 a->nonew = (flg ? -1 : 0); 930 break; 931 case MAT_NEW_NONZERO_ALLOCATION_ERR: 932 a->nonew = (flg ? -2 : 0); 933 break; 934 case MAT_UNUSED_NONZERO_LOCATION_ERR: 935 a->nounused = (flg ? -1 : 0); 936 break; 937 case MAT_FORCE_DIAGONAL_ENTRIES: 938 case MAT_IGNORE_OFF_PROC_ENTRIES: 939 case MAT_USE_HASH_TABLE: 940 case MAT_SORTED_FULL: 941 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 942 break; 943 case MAT_SPD: 944 case MAT_SYMMETRIC: 945 case MAT_STRUCTURALLY_SYMMETRIC: 946 case MAT_HERMITIAN: 947 case MAT_SYMMETRY_ETERNAL: 948 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 949 case MAT_SPD_ETERNAL: 950 /* These options are handled directly by MatSetOption() */ 951 break; 952 default: 953 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 954 } 955 PetscFunctionReturn(PETSC_SUCCESS); 956 } 957 958 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 959 { 960 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 961 PetscInt i, j, n, shift; 962 PetscScalar *x, zero = 0.0; 963 964 PetscFunctionBegin; 965 PetscCall(VecGetLocalSize(v, &n)); 966 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 967 968 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 969 PetscInt *diag = a->diag; 970 PetscCall(VecGetArray(v, &x)); 971 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 972 PetscCall(VecRestoreArray(v, &x)); 973 PetscFunctionReturn(PETSC_SUCCESS); 974 } 975 976 PetscCall(VecSet(v, zero)); 977 PetscCall(VecGetArray(v, &x)); 978 for (i = 0; i < n; i++) { /* loop over rows */ 979 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 980 x[i] = 0; 981 for (j = 0; j < a->rlen[i]; j++) { 982 if (a->colidx[shift + a->sliceheight * j] == i) { 983 x[i] = a->val[shift + a->sliceheight * j]; 984 break; 985 } 986 } 987 } 988 PetscCall(VecRestoreArray(v, &x)); 989 PetscFunctionReturn(PETSC_SUCCESS); 990 } 991 992 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 993 { 994 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 995 const PetscScalar *l, *r; 996 PetscInt i, j, m, n, row; 997 998 PetscFunctionBegin; 999 if (ll) { 1000 /* The local size is used so that VecMPI can be passed to this routine 1001 by MatDiagonalScale_MPISELL */ 1002 PetscCall(VecGetLocalSize(ll, &m)); 1003 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1004 PetscCall(VecGetArrayRead(ll, &l)); 1005 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1006 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1007 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1008 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1009 } 1010 } else { 1011 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1012 } 1013 } 1014 PetscCall(VecRestoreArrayRead(ll, &l)); 1015 PetscCall(PetscLogFlops(a->nz)); 1016 } 1017 if (rr) { 1018 PetscCall(VecGetLocalSize(rr, &n)); 1019 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1020 PetscCall(VecGetArrayRead(rr, &r)); 1021 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1022 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1023 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1024 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1025 } 1026 } else { 1027 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1028 } 1029 } 1030 PetscCall(VecRestoreArrayRead(rr, &r)); 1031 PetscCall(PetscLogFlops(a->nz)); 1032 } 1033 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1034 #if defined(PETSC_HAVE_CUDA) 1035 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1036 #endif 1037 PetscFunctionReturn(PETSC_SUCCESS); 1038 } 1039 1040 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1041 { 1042 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1043 PetscInt *cp, i, k, low, high, t, row, col, l; 1044 PetscInt shift; 1045 MatScalar *vp; 1046 1047 PetscFunctionBegin; 1048 for (k = 0; k < m; k++) { /* loop over requested rows */ 1049 row = im[k]; 1050 if (row < 0) continue; 1051 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1052 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1053 cp = a->colidx + shift; /* pointer to the row */ 1054 vp = a->val + shift; /* pointer to the row */ 1055 for (l = 0; l < n; l++) { /* loop over requested columns */ 1056 col = in[l]; 1057 if (col < 0) continue; 1058 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1059 high = a->rlen[row]; 1060 low = 0; /* assume unsorted */ 1061 while (high - low > 5) { 1062 t = (low + high) / 2; 1063 if (*(cp + a->sliceheight * t) > col) high = t; 1064 else low = t; 1065 } 1066 for (i = low; i < high; i++) { 1067 if (*(cp + a->sliceheight * i) > col) break; 1068 if (*(cp + a->sliceheight * i) == col) { 1069 *v++ = *(vp + a->sliceheight * i); 1070 goto finished; 1071 } 1072 } 1073 *v++ = 0.0; 1074 finished:; 1075 } 1076 } 1077 PetscFunctionReturn(PETSC_SUCCESS); 1078 } 1079 1080 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1081 { 1082 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1083 PetscInt i, j, m = A->rmap->n, shift; 1084 const char *name; 1085 PetscViewerFormat format; 1086 1087 PetscFunctionBegin; 1088 PetscCall(PetscViewerGetFormat(viewer, &format)); 1089 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1090 PetscInt nofinalvalue = 0; 1091 /* 1092 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1093 nofinalvalue = 1; 1094 } 1095 */ 1096 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1097 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1099 #if defined(PETSC_USE_COMPLEX) 1100 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1101 #else 1102 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1103 #endif 1104 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1105 1106 for (i = 0; i < m; i++) { 1107 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1108 for (j = 0; j < a->rlen[i]; j++) { 1109 #if defined(PETSC_USE_COMPLEX) 1110 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1111 #else 1112 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1113 #endif 1114 } 1115 } 1116 /* 1117 if (nofinalvalue) { 1118 #if defined(PETSC_USE_COMPLEX) 1119 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1120 #else 1121 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1122 #endif 1123 } 1124 */ 1125 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1126 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1127 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1128 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1129 PetscFunctionReturn(PETSC_SUCCESS); 1130 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1131 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1132 for (i = 0; i < m; i++) { 1133 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1134 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1135 for (j = 0; j < a->rlen[i]; j++) { 1136 #if defined(PETSC_USE_COMPLEX) 1137 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1138 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1139 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1140 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1141 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1142 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1143 } 1144 #else 1145 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1146 #endif 1147 } 1148 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1149 } 1150 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1151 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1152 PetscInt cnt = 0, jcnt; 1153 PetscScalar value; 1154 #if defined(PETSC_USE_COMPLEX) 1155 PetscBool realonly = PETSC_TRUE; 1156 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1157 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1158 realonly = PETSC_FALSE; 1159 break; 1160 } 1161 } 1162 #endif 1163 1164 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1165 for (i = 0; i < m; i++) { 1166 jcnt = 0; 1167 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1168 for (j = 0; j < A->cmap->n; j++) { 1169 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1170 value = a->val[cnt++]; 1171 jcnt++; 1172 } else { 1173 value = 0.0; 1174 } 1175 #if defined(PETSC_USE_COMPLEX) 1176 if (realonly) { 1177 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1178 } else { 1179 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1180 } 1181 #else 1182 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1183 #endif 1184 } 1185 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1186 } 1187 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1188 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1189 PetscInt fshift = 1; 1190 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1191 #if defined(PETSC_USE_COMPLEX) 1192 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1193 #else 1194 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1195 #endif 1196 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1197 for (i = 0; i < m; i++) { 1198 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1199 for (j = 0; j < a->rlen[i]; j++) { 1200 #if defined(PETSC_USE_COMPLEX) 1201 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1202 #else 1203 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1204 #endif 1205 } 1206 } 1207 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1208 } else if (format == PETSC_VIEWER_NATIVE) { 1209 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1210 PetscInt row; 1211 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1212 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1213 #if defined(PETSC_USE_COMPLEX) 1214 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1215 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1216 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1217 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1218 } else { 1219 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1220 } 1221 #else 1222 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1223 #endif 1224 } 1225 } 1226 } else { 1227 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1228 if (A->factortype) { 1229 for (i = 0; i < m; i++) { 1230 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1231 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1232 /* L part */ 1233 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1234 #if defined(PETSC_USE_COMPLEX) 1235 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1236 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1237 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1239 } else { 1240 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1241 } 1242 #else 1243 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1244 #endif 1245 } 1246 /* diagonal */ 1247 j = a->diag[i]; 1248 #if defined(PETSC_USE_COMPLEX) 1249 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1251 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1253 } else { 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1255 } 1256 #else 1257 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1258 #endif 1259 1260 /* U part */ 1261 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1262 #if defined(PETSC_USE_COMPLEX) 1263 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1264 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1265 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1266 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1267 } else { 1268 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1269 } 1270 #else 1271 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1272 #endif 1273 } 1274 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1275 } 1276 } else { 1277 for (i = 0; i < m; i++) { 1278 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1279 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1280 for (j = 0; j < a->rlen[i]; j++) { 1281 #if defined(PETSC_USE_COMPLEX) 1282 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1283 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1284 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1285 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1286 } else { 1287 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1288 } 1289 #else 1290 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1291 #endif 1292 } 1293 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1294 } 1295 } 1296 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1297 } 1298 PetscCall(PetscViewerFlush(viewer)); 1299 PetscFunctionReturn(PETSC_SUCCESS); 1300 } 1301 1302 #include <petscdraw.h> 1303 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1304 { 1305 Mat A = (Mat)Aa; 1306 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1307 PetscInt i, j, m = A->rmap->n, shift; 1308 int color; 1309 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1310 PetscViewer viewer; 1311 PetscViewerFormat format; 1312 1313 PetscFunctionBegin; 1314 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1315 PetscCall(PetscViewerGetFormat(viewer, &format)); 1316 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1317 1318 /* loop over matrix elements drawing boxes */ 1319 1320 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1321 PetscDrawCollectiveBegin(draw); 1322 /* Blue for negative, Cyan for zero and Red for positive */ 1323 color = PETSC_DRAW_BLUE; 1324 for (i = 0; i < m; i++) { 1325 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1326 y_l = m - i - 1.0; 1327 y_r = y_l + 1.0; 1328 for (j = 0; j < a->rlen[i]; j++) { 1329 x_l = a->colidx[shift + a->sliceheight * j]; 1330 x_r = x_l + 1.0; 1331 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1332 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1333 } 1334 } 1335 color = PETSC_DRAW_CYAN; 1336 for (i = 0; i < m; i++) { 1337 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1338 y_l = m - i - 1.0; 1339 y_r = y_l + 1.0; 1340 for (j = 0; j < a->rlen[i]; j++) { 1341 x_l = a->colidx[shift + a->sliceheight * j]; 1342 x_r = x_l + 1.0; 1343 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1344 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1345 } 1346 } 1347 color = PETSC_DRAW_RED; 1348 for (i = 0; i < m; i++) { 1349 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1350 y_l = m - i - 1.0; 1351 y_r = y_l + 1.0; 1352 for (j = 0; j < a->rlen[i]; j++) { 1353 x_l = a->colidx[shift + a->sliceheight * j]; 1354 x_r = x_l + 1.0; 1355 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1356 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1357 } 1358 } 1359 PetscDrawCollectiveEnd(draw); 1360 } else { 1361 /* use contour shading to indicate magnitude of values */ 1362 /* first determine max of all nonzero values */ 1363 PetscReal minv = 0.0, maxv = 0.0; 1364 PetscInt count = 0; 1365 PetscDraw popup; 1366 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1367 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1368 } 1369 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1370 PetscCall(PetscDrawGetPopup(draw, &popup)); 1371 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1372 1373 PetscDrawCollectiveBegin(draw); 1374 for (i = 0; i < m; i++) { 1375 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1376 y_l = m - i - 1.0; 1377 y_r = y_l + 1.0; 1378 for (j = 0; j < a->rlen[i]; j++) { 1379 x_l = a->colidx[shift + a->sliceheight * j]; 1380 x_r = x_l + 1.0; 1381 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1382 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1383 count++; 1384 } 1385 } 1386 PetscDrawCollectiveEnd(draw); 1387 } 1388 PetscFunctionReturn(PETSC_SUCCESS); 1389 } 1390 1391 #include <petscdraw.h> 1392 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1393 { 1394 PetscDraw draw; 1395 PetscReal xr, yr, xl, yl, h, w; 1396 PetscBool isnull; 1397 1398 PetscFunctionBegin; 1399 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1400 PetscCall(PetscDrawIsNull(draw, &isnull)); 1401 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1402 1403 xr = A->cmap->n; 1404 yr = A->rmap->n; 1405 h = yr / 10.0; 1406 w = xr / 10.0; 1407 xr += w; 1408 yr += h; 1409 xl = -w; 1410 yl = -h; 1411 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1412 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1413 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1414 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1415 PetscCall(PetscDrawSave(draw)); 1416 PetscFunctionReturn(PETSC_SUCCESS); 1417 } 1418 1419 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1420 { 1421 PetscBool iascii, isbinary, isdraw; 1422 1423 PetscFunctionBegin; 1424 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1425 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1426 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1427 if (iascii) { 1428 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1429 } else if (isbinary) { 1430 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1431 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1432 PetscFunctionReturn(PETSC_SUCCESS); 1433 } 1434 1435 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1436 { 1437 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1438 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1439 MatScalar *vp; 1440 #if defined(PETSC_HAVE_CUDA) 1441 PetscInt totalchunks = 0; 1442 #endif 1443 1444 PetscFunctionBegin; 1445 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1446 /* To do: compress out the unused elements */ 1447 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1448 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1449 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1450 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1451 a->nonzerorowcnt = 0; 1452 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1453 for (i = 0; i < a->totalslices; ++i) { 1454 shift = a->sliidx[i]; /* starting index of the slice */ 1455 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1456 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1457 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1458 row = a->sliceheight * i + row_in_slice; 1459 nrow = a->rlen[row]; /* number of nonzeros in row */ 1460 /* 1461 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1462 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1463 */ 1464 lastcol = 0; 1465 if (nrow > 0) { /* nonempty row */ 1466 a->nonzerorowcnt++; 1467 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1468 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1469 for (j = 1; j < a->sliceheight; j++) { 1470 if (a->rlen[a->sliceheight * i + j]) { 1471 lastcol = cp[j]; 1472 break; 1473 } 1474 } 1475 } else { 1476 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1477 } 1478 1479 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1480 cp[a->sliceheight * k + row_in_slice] = lastcol; 1481 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1482 } 1483 } 1484 } 1485 1486 A->info.mallocs += a->reallocs; 1487 a->reallocs = 0; 1488 1489 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1490 #if defined(PETSC_HAVE_CUDA) 1491 if (!a->chunksize && a->totalslices) { 1492 a->chunksize = 64; 1493 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1494 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1495 } 1496 if (totalchunks != a->totalchunks) { 1497 PetscCall(PetscFree(a->chunk_slice_map)); 1498 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1499 a->totalchunks = totalchunks; 1500 } 1501 j = 0; 1502 for (i = 0; i < totalchunks; i++) { 1503 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1504 a->chunk_slice_map[i] = j; 1505 } 1506 #endif 1507 PetscFunctionReturn(PETSC_SUCCESS); 1508 } 1509 1510 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1511 { 1512 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1513 1514 PetscFunctionBegin; 1515 info->block_size = 1.0; 1516 info->nz_allocated = a->maxallocmat; 1517 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1518 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1519 info->assemblies = A->num_ass; 1520 info->mallocs = A->info.mallocs; 1521 info->memory = 0; /* REVIEW ME */ 1522 if (A->factortype) { 1523 info->fill_ratio_given = A->info.fill_ratio_given; 1524 info->fill_ratio_needed = A->info.fill_ratio_needed; 1525 info->factor_mallocs = A->info.factor_mallocs; 1526 } else { 1527 info->fill_ratio_given = 0; 1528 info->fill_ratio_needed = 0; 1529 info->factor_mallocs = 0; 1530 } 1531 PetscFunctionReturn(PETSC_SUCCESS); 1532 } 1533 1534 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1535 { 1536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1537 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1538 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1539 MatScalar *vp, value; 1540 #if defined(PETSC_HAVE_CUDA) 1541 PetscBool inserted = PETSC_FALSE; 1542 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1543 #endif 1544 1545 PetscFunctionBegin; 1546 for (k = 0; k < m; k++) { /* loop over added rows */ 1547 row = im[k]; 1548 if (row < 0) continue; 1549 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1550 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1551 cp = a->colidx + shift; /* pointer to the row */ 1552 vp = a->val + shift; /* pointer to the row */ 1553 nrow = a->rlen[row]; 1554 low = 0; 1555 high = nrow; 1556 1557 for (l = 0; l < n; l++) { /* loop over added columns */ 1558 col = in[l]; 1559 if (col < 0) continue; 1560 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1561 if (a->roworiented) { 1562 value = v[l + k * n]; 1563 } else { 1564 value = v[k + l * m]; 1565 } 1566 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1567 1568 /* search in this row for the specified column, i indicates the column to be set */ 1569 if (col <= lastcol) low = 0; 1570 else high = nrow; 1571 lastcol = col; 1572 while (high - low > 5) { 1573 t = (low + high) / 2; 1574 if (*(cp + a->sliceheight * t) > col) high = t; 1575 else low = t; 1576 } 1577 for (i = low; i < high; i++) { 1578 if (*(cp + a->sliceheight * i) > col) break; 1579 if (*(cp + a->sliceheight * i) == col) { 1580 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1581 else *(vp + a->sliceheight * i) = value; 1582 #if defined(PETSC_HAVE_CUDA) 1583 inserted = PETSC_TRUE; 1584 #endif 1585 low = i + 1; 1586 goto noinsert; 1587 } 1588 } 1589 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1590 if (nonew == 1) goto noinsert; 1591 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1592 #if defined(PETSC_HAVE_CUDA) 1593 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1594 #else 1595 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1596 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1597 #endif 1598 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1599 for (ii = nrow - 1; ii >= i; ii--) { 1600 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1601 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1602 } 1603 a->rlen[row]++; 1604 *(cp + a->sliceheight * i) = col; 1605 *(vp + a->sliceheight * i) = value; 1606 a->nz++; 1607 A->nonzerostate++; 1608 #if defined(PETSC_HAVE_CUDA) 1609 inserted = PETSC_TRUE; 1610 #endif 1611 low = i + 1; 1612 high++; 1613 nrow++; 1614 noinsert:; 1615 } 1616 a->rlen[row] = nrow; 1617 } 1618 #if defined(PETSC_HAVE_CUDA) 1619 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1620 #endif 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1625 { 1626 PetscFunctionBegin; 1627 /* If the two matrices have the same copy implementation, use fast copy. */ 1628 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1629 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1630 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1631 1632 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1633 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1634 } else { 1635 PetscCall(MatCopy_Basic(A, B, str)); 1636 } 1637 PetscFunctionReturn(PETSC_SUCCESS); 1638 } 1639 1640 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1641 { 1642 PetscFunctionBegin; 1643 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1644 PetscFunctionReturn(PETSC_SUCCESS); 1645 } 1646 1647 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1648 { 1649 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1650 1651 PetscFunctionBegin; 1652 *array = a->val; 1653 PetscFunctionReturn(PETSC_SUCCESS); 1654 } 1655 1656 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1657 { 1658 PetscFunctionBegin; 1659 PetscFunctionReturn(PETSC_SUCCESS); 1660 } 1661 1662 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1663 { 1664 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1665 PetscInt i; 1666 MatScalar *aval = a->val; 1667 1668 PetscFunctionBegin; 1669 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1670 #if defined(PETSC_HAVE_CUDA) 1671 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1672 #endif 1673 PetscFunctionReturn(PETSC_SUCCESS); 1674 } 1675 1676 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1677 { 1678 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1679 PetscInt i; 1680 MatScalar *aval = a->val; 1681 1682 PetscFunctionBegin; 1683 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1684 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1685 #if defined(PETSC_HAVE_CUDA) 1686 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1687 #endif 1688 PetscFunctionReturn(PETSC_SUCCESS); 1689 } 1690 1691 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1692 { 1693 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1694 MatScalar *aval = a->val; 1695 PetscScalar oalpha = alpha; 1696 PetscBLASInt one = 1, size; 1697 1698 PetscFunctionBegin; 1699 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1700 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1701 PetscCall(PetscLogFlops(a->nz)); 1702 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1703 #if defined(PETSC_HAVE_CUDA) 1704 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1705 #endif 1706 PetscFunctionReturn(PETSC_SUCCESS); 1707 } 1708 1709 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1710 { 1711 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1712 1713 PetscFunctionBegin; 1714 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1715 PetscCall(MatShift_Basic(Y, a)); 1716 PetscFunctionReturn(PETSC_SUCCESS); 1717 } 1718 1719 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1720 { 1721 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1722 PetscScalar *x, sum, *t; 1723 const MatScalar *idiag = NULL, *mdiag; 1724 const PetscScalar *b, *xb; 1725 PetscInt n, m = A->rmap->n, i, j, shift; 1726 const PetscInt *diag; 1727 1728 PetscFunctionBegin; 1729 its = its * lits; 1730 1731 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1732 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1733 a->fshift = fshift; 1734 a->omega = omega; 1735 1736 diag = a->diag; 1737 t = a->ssor_work; 1738 idiag = a->idiag; 1739 mdiag = a->mdiag; 1740 1741 PetscCall(VecGetArray(xx, &x)); 1742 PetscCall(VecGetArrayRead(bb, &b)); 1743 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1744 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1745 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1746 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1747 1748 if (flag & SOR_ZERO_INITIAL_GUESS) { 1749 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1750 for (i = 0; i < m; i++) { 1751 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1752 sum = b[i]; 1753 n = (diag[i] - shift) / a->sliceheight; 1754 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1755 t[i] = sum; 1756 x[i] = sum * idiag[i]; 1757 } 1758 xb = t; 1759 PetscCall(PetscLogFlops(a->nz)); 1760 } else xb = b; 1761 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1762 for (i = m - 1; i >= 0; i--) { 1763 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1764 sum = xb[i]; 1765 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1766 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1767 if (xb == b) { 1768 x[i] = sum * idiag[i]; 1769 } else { 1770 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1771 } 1772 } 1773 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1774 } 1775 its--; 1776 } 1777 while (its--) { 1778 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1779 for (i = 0; i < m; i++) { 1780 /* lower */ 1781 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1782 sum = b[i]; 1783 n = (diag[i] - shift) / a->sliceheight; 1784 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1785 t[i] = sum; /* save application of the lower-triangular part */ 1786 /* upper */ 1787 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1788 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1789 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1790 } 1791 xb = t; 1792 PetscCall(PetscLogFlops(2.0 * a->nz)); 1793 } else xb = b; 1794 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1795 for (i = m - 1; i >= 0; i--) { 1796 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1797 sum = xb[i]; 1798 if (xb == b) { 1799 /* whole matrix (no checkpointing available) */ 1800 n = a->rlen[i]; 1801 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1802 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1803 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1804 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1805 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1806 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1807 } 1808 } 1809 if (xb == b) { 1810 PetscCall(PetscLogFlops(2.0 * a->nz)); 1811 } else { 1812 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1813 } 1814 } 1815 } 1816 PetscCall(VecRestoreArray(xx, &x)); 1817 PetscCall(VecRestoreArrayRead(bb, &b)); 1818 PetscFunctionReturn(PETSC_SUCCESS); 1819 } 1820 1821 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1822 MatGetRow_SeqSELL, 1823 MatRestoreRow_SeqSELL, 1824 MatMult_SeqSELL, 1825 /* 4*/ MatMultAdd_SeqSELL, 1826 MatMultTranspose_SeqSELL, 1827 MatMultTransposeAdd_SeqSELL, 1828 NULL, 1829 NULL, 1830 NULL, 1831 /* 10*/ NULL, 1832 NULL, 1833 NULL, 1834 MatSOR_SeqSELL, 1835 NULL, 1836 /* 15*/ MatGetInfo_SeqSELL, 1837 MatEqual_SeqSELL, 1838 MatGetDiagonal_SeqSELL, 1839 MatDiagonalScale_SeqSELL, 1840 NULL, 1841 /* 20*/ NULL, 1842 MatAssemblyEnd_SeqSELL, 1843 MatSetOption_SeqSELL, 1844 MatZeroEntries_SeqSELL, 1845 /* 24*/ NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 /* 29*/ MatSetUp_SeqSELL, 1851 NULL, 1852 NULL, 1853 NULL, 1854 NULL, 1855 /* 34*/ MatDuplicate_SeqSELL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 /* 39*/ NULL, 1861 NULL, 1862 NULL, 1863 MatGetValues_SeqSELL, 1864 MatCopy_SeqSELL, 1865 /* 44*/ NULL, 1866 MatScale_SeqSELL, 1867 MatShift_SeqSELL, 1868 NULL, 1869 NULL, 1870 /* 49*/ NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 NULL, 1875 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1876 NULL, 1877 NULL, 1878 NULL, 1879 NULL, 1880 /* 59*/ NULL, 1881 MatDestroy_SeqSELL, 1882 MatView_SeqSELL, 1883 NULL, 1884 NULL, 1885 /* 64*/ NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 /* 69*/ NULL, 1891 NULL, 1892 NULL, 1893 NULL, 1894 NULL, 1895 /* 74*/ NULL, 1896 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1897 NULL, 1898 NULL, 1899 NULL, 1900 /* 79*/ NULL, 1901 NULL, 1902 NULL, 1903 NULL, 1904 NULL, 1905 /* 84*/ NULL, 1906 NULL, 1907 NULL, 1908 NULL, 1909 NULL, 1910 /* 89*/ NULL, 1911 NULL, 1912 NULL, 1913 NULL, 1914 NULL, 1915 /* 94*/ NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 NULL, 1920 /* 99*/ NULL, 1921 NULL, 1922 NULL, 1923 MatConjugate_SeqSELL, 1924 NULL, 1925 /*104*/ NULL, 1926 NULL, 1927 NULL, 1928 NULL, 1929 NULL, 1930 /*109*/ NULL, 1931 NULL, 1932 NULL, 1933 NULL, 1934 MatMissingDiagonal_SeqSELL, 1935 /*114*/ NULL, 1936 NULL, 1937 NULL, 1938 NULL, 1939 NULL, 1940 /*119*/ NULL, 1941 NULL, 1942 NULL, 1943 NULL, 1944 NULL, 1945 /*124*/ NULL, 1946 NULL, 1947 NULL, 1948 NULL, 1949 NULL, 1950 /*129*/ NULL, 1951 NULL, 1952 NULL, 1953 NULL, 1954 NULL, 1955 /*134*/ NULL, 1956 NULL, 1957 NULL, 1958 NULL, 1959 NULL, 1960 /*139*/ NULL, 1961 NULL, 1962 NULL, 1963 MatFDColoringSetUp_SeqXAIJ, 1964 NULL, 1965 /*144*/ NULL, 1966 NULL, 1967 NULL, 1968 NULL, 1969 NULL, 1970 NULL, 1971 /*150*/ NULL, 1972 NULL}; 1973 1974 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1975 { 1976 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1977 1978 PetscFunctionBegin; 1979 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1980 1981 /* allocate space for values if not already there */ 1982 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1983 1984 /* copy values over */ 1985 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1986 PetscFunctionReturn(PETSC_SUCCESS); 1987 } 1988 1989 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1990 { 1991 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1992 1993 PetscFunctionBegin; 1994 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1995 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1996 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1997 PetscFunctionReturn(PETSC_SUCCESS); 1998 } 1999 2000 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 2001 { 2002 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2003 2004 PetscFunctionBegin; 2005 if (a->totalslices && a->sliidx[a->totalslices]) { 2006 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 2007 } else { 2008 *ratio = 0.0; 2009 } 2010 PetscFunctionReturn(PETSC_SUCCESS); 2011 } 2012 2013 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 2014 { 2015 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2016 PetscInt i, current_slicewidth; 2017 2018 PetscFunctionBegin; 2019 *slicewidth = 0; 2020 for (i = 0; i < a->totalslices; i++) { 2021 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 2022 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 2023 } 2024 PetscFunctionReturn(PETSC_SUCCESS); 2025 } 2026 2027 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 2028 { 2029 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2030 2031 PetscFunctionBegin; 2032 *slicewidth = 0; 2033 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2034 PetscFunctionReturn(PETSC_SUCCESS); 2035 } 2036 2037 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2038 { 2039 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2040 2041 PetscFunctionBegin; 2042 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2043 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2044 a->sliceheight = sliceheight; 2045 #if defined(PETSC_HAVE_CUDA) 2046 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2047 #endif 2048 PetscFunctionReturn(PETSC_SUCCESS); 2049 } 2050 2051 /*@C 2052 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 2053 2054 Not Collective 2055 2056 Input Parameters: 2057 + A - a `MATSEQSELL` matrix 2058 - array - pointer to the data 2059 2060 Level: intermediate 2061 2062 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 2063 @*/ 2064 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 2065 { 2066 PetscFunctionBegin; 2067 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 2068 PetscFunctionReturn(PETSC_SUCCESS); 2069 } 2070 2071 /*@C 2072 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2073 2074 Not Collective 2075 2076 Input Parameter: 2077 . A - a MATSEQSELL matrix 2078 2079 Output Parameter: 2080 . ratio - ratio of number of padded zeros to number of allocated elements 2081 2082 Level: intermediate 2083 @*/ 2084 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2085 { 2086 PetscFunctionBegin; 2087 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2088 PetscFunctionReturn(PETSC_SUCCESS); 2089 } 2090 2091 /*@C 2092 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2093 2094 Not Collective 2095 2096 Input Parameter: 2097 . A - a MATSEQSELL matrix 2098 2099 Output Parameter: 2100 . slicewidth - maximum slice width 2101 2102 Level: intermediate 2103 @*/ 2104 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2105 { 2106 PetscFunctionBegin; 2107 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2108 PetscFunctionReturn(PETSC_SUCCESS); 2109 } 2110 2111 /*@C 2112 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2113 2114 Not Collective 2115 2116 Input Parameter: 2117 . A - a MATSEQSELL matrix 2118 2119 Output Parameter: 2120 . slicewidth - average slice width 2121 2122 Level: intermediate 2123 @*/ 2124 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2125 { 2126 PetscFunctionBegin; 2127 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2128 PetscFunctionReturn(PETSC_SUCCESS); 2129 } 2130 2131 /*@C 2132 MatSeqSELLSetSliceHeight - sets the slice height. 2133 2134 Not Collective 2135 2136 Input Parameters: 2137 + A - a MATSEQSELL matrix 2138 - sliceheight - slice height 2139 2140 Notes: 2141 You cannot change the slice height once it have been set. 2142 2143 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2144 2145 Level: intermediate 2146 @*/ 2147 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2148 { 2149 PetscFunctionBegin; 2150 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2151 PetscFunctionReturn(PETSC_SUCCESS); 2152 } 2153 2154 /*@C 2155 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2156 2157 Not Collective 2158 2159 Input Parameter: 2160 . A - a MATSEQSELL matrix 2161 2162 Output Parameter: 2163 . variance - variance of the slice size 2164 2165 Level: intermediate 2166 @*/ 2167 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2168 { 2169 PetscFunctionBegin; 2170 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2171 PetscFunctionReturn(PETSC_SUCCESS); 2172 } 2173 2174 #if defined(PETSC_HAVE_CUDA) 2175 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2176 #endif 2177 2178 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2179 { 2180 Mat_SeqSELL *b; 2181 PetscMPIInt size; 2182 2183 PetscFunctionBegin; 2184 PetscCall(PetscCitationsRegister(citation, &cited)); 2185 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2186 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2187 2188 PetscCall(PetscNew(&b)); 2189 2190 B->data = (void *)b; 2191 2192 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 2193 2194 b->row = NULL; 2195 b->col = NULL; 2196 b->icol = NULL; 2197 b->reallocs = 0; 2198 b->ignorezeroentries = PETSC_FALSE; 2199 b->roworiented = PETSC_TRUE; 2200 b->nonew = 0; 2201 b->diag = NULL; 2202 b->solve_work = NULL; 2203 B->spptr = NULL; 2204 b->saved_values = NULL; 2205 b->idiag = NULL; 2206 b->mdiag = NULL; 2207 b->ssor_work = NULL; 2208 b->omega = 1.0; 2209 b->fshift = 0.0; 2210 b->idiagvalid = PETSC_FALSE; 2211 b->keepnonzeropattern = PETSC_FALSE; 2212 b->sliceheight = 0; 2213 2214 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2215 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2216 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2217 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2218 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2219 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2220 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ)); 2221 #if defined(PETSC_HAVE_CUDA) 2222 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2223 #endif 2224 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2225 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2226 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2227 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2228 2229 PetscObjectOptionsBegin((PetscObject)B); 2230 { 2231 PetscInt newsh = -1; 2232 PetscBool flg; 2233 #if defined(PETSC_HAVE_CUDA) 2234 PetscInt chunksize = 0; 2235 #endif 2236 2237 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2238 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2239 #if defined(PETSC_HAVE_CUDA) 2240 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2241 if (flg) { 2242 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2243 b->chunksize = chunksize; 2244 } 2245 #endif 2246 } 2247 PetscOptionsEnd(); 2248 PetscFunctionReturn(PETSC_SUCCESS); 2249 } 2250 2251 /* 2252 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2253 */ 2254 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2255 { 2256 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2257 PetscInt i, m = A->rmap->n; 2258 PetscInt totalslices = a->totalslices; 2259 2260 PetscFunctionBegin; 2261 C->factortype = A->factortype; 2262 c->row = NULL; 2263 c->col = NULL; 2264 c->icol = NULL; 2265 c->reallocs = 0; 2266 C->assembled = PETSC_TRUE; 2267 2268 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2269 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2270 2271 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2272 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2273 2274 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2275 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2276 2277 /* allocate the matrix space */ 2278 if (mallocmatspace) { 2279 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2280 2281 c->singlemalloc = PETSC_TRUE; 2282 2283 if (m > 0) { 2284 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2285 if (cpvalues == MAT_COPY_VALUES) { 2286 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2287 } else { 2288 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2289 } 2290 } 2291 } 2292 2293 c->ignorezeroentries = a->ignorezeroentries; 2294 c->roworiented = a->roworiented; 2295 c->nonew = a->nonew; 2296 if (a->diag) { 2297 PetscCall(PetscMalloc1(m, &c->diag)); 2298 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2299 } else c->diag = NULL; 2300 2301 c->solve_work = NULL; 2302 c->saved_values = NULL; 2303 c->idiag = NULL; 2304 c->ssor_work = NULL; 2305 c->keepnonzeropattern = a->keepnonzeropattern; 2306 c->free_val = PETSC_TRUE; 2307 c->free_colidx = PETSC_TRUE; 2308 2309 c->maxallocmat = a->maxallocmat; 2310 c->maxallocrow = a->maxallocrow; 2311 c->rlenmax = a->rlenmax; 2312 c->nz = a->nz; 2313 C->preallocated = PETSC_TRUE; 2314 2315 c->nonzerorowcnt = a->nonzerorowcnt; 2316 C->nonzerostate = A->nonzerostate; 2317 2318 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2319 PetscFunctionReturn(PETSC_SUCCESS); 2320 } 2321 2322 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2323 { 2324 PetscFunctionBegin; 2325 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2326 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2327 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2328 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2329 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2330 PetscFunctionReturn(PETSC_SUCCESS); 2331 } 2332 2333 /*MC 2334 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2335 based on the sliced Ellpack format 2336 2337 Options Database Key: 2338 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2339 2340 Level: beginner 2341 2342 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2343 M*/ 2344 2345 /*MC 2346 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2347 2348 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2349 and `MATMPISELL` otherwise. As a result, for single process communicators, 2350 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2351 for communicators controlling multiple processes. It is recommended that you call both of 2352 the above preallocation routines for simplicity. 2353 2354 Options Database Key: 2355 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2356 2357 Level: beginner 2358 2359 Notes: 2360 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2361 2362 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2363 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2364 2365 Developer Notes: 2366 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2367 2368 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2369 .vb 2370 (2 0 3 4) 2371 Consider the matrix A = (5 0 6 0) 2372 (0 0 7 8) 2373 (0 0 9 9) 2374 2375 symbolically the Ellpack format can be written as 2376 2377 (2 3 4 |) (0 2 3 |) 2378 v = (5 6 0 |) colidx = (0 2 2 |) 2379 -------- --------- 2380 (7 8 |) (2 3 |) 2381 (9 9 |) (2 3 |) 2382 2383 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2384 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2385 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2386 2387 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2388 2389 .ve 2390 2391 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2392 2393 References: 2394 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2395 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2396 2397 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2398 M*/ 2399 2400 /*@C 2401 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2402 2403 Collective 2404 2405 Input Parameters: 2406 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2407 . m - number of rows 2408 . n - number of columns 2409 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2410 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2411 2412 Output Parameter: 2413 . A - the matrix 2414 2415 Level: intermediate 2416 2417 Notes: 2418 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2419 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2420 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2421 2422 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2423 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2424 allocation. 2425 2426 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2427 @*/ 2428 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2429 { 2430 PetscFunctionBegin; 2431 PetscCall(MatCreate(comm, A)); 2432 PetscCall(MatSetSizes(*A, m, n, m, n)); 2433 PetscCall(MatSetType(*A, MATSEQSELL)); 2434 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2435 PetscFunctionReturn(PETSC_SUCCESS); 2436 } 2437 2438 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2439 { 2440 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2441 PetscInt totalslices = a->totalslices; 2442 2443 PetscFunctionBegin; 2444 /* If the matrix dimensions are not equal,or no of nonzeros */ 2445 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2446 *flg = PETSC_FALSE; 2447 PetscFunctionReturn(PETSC_SUCCESS); 2448 } 2449 /* if the a->colidx are the same */ 2450 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2451 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2452 /* if a->val are the same */ 2453 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2454 PetscFunctionReturn(PETSC_SUCCESS); 2455 } 2456 2457 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2458 { 2459 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2460 2461 PetscFunctionBegin; 2462 a->idiagvalid = PETSC_FALSE; 2463 PetscFunctionReturn(PETSC_SUCCESS); 2464 } 2465 2466 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2467 { 2468 #if defined(PETSC_USE_COMPLEX) 2469 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2470 PetscInt i; 2471 PetscScalar *val = a->val; 2472 2473 PetscFunctionBegin; 2474 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2475 #if defined(PETSC_HAVE_CUDA) 2476 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2477 #endif 2478 #else 2479 PetscFunctionBegin; 2480 #endif 2481 PetscFunctionReturn(PETSC_SUCCESS); 2482 } 2483