1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 79 80 Level: intermediate 81 82 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 PetscCall(PetscLayoutSetUp(B->rmap)); 108 PetscCall(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 PetscCall(PetscMalloc1(totalslices+1,&b->sliidx)); 131 PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 162 PetscCall(PetscCalloc1(8*totalslices,&b->rlen)); 163 PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) { 179 PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 180 } 181 PetscFunctionReturn(0); 182 } 183 184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 185 { 186 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 187 PetscInt shift; 188 189 PetscFunctionBegin; 190 PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 191 if (nz) *nz = a->rlen[row]; 192 shift = a->sliidx[row>>3]+(row&0x07); 193 if (!a->getrowcols) { 194 195 PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 196 } 197 if (idx) { 198 PetscInt j; 199 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 200 *idx = a->getrowcols; 201 } 202 if (v) { 203 PetscInt j; 204 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 205 *v = a->getrowvals; 206 } 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 211 { 212 PetscFunctionBegin; 213 PetscFunctionReturn(0); 214 } 215 216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 217 { 218 Mat B; 219 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (reuse == MAT_REUSE_MATRIX) { 224 B = *newmat; 225 PetscCall(MatZeroEntries(B)); 226 } else { 227 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 228 PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 229 PetscCall(MatSetType(B,MATSEQAIJ)); 230 PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen)); 231 } 232 233 for (i=0; i<A->rmap->n; i++) { 234 PetscInt nz = 0,*cols = NULL; 235 PetscScalar *vals = NULL; 236 237 PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 239 PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 240 } 241 242 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 243 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 244 B->rmap->bs = A->rmap->bs; 245 246 if (reuse == MAT_INPLACE_MATRIX) { 247 PetscCall(MatHeaderReplace(A,&B)); 248 } else { 249 *newmat = B; 250 } 251 PetscFunctionReturn(0); 252 } 253 254 #include <../src/mat/impls/aij/seq/aij.h> 255 256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 257 { 258 Mat B; 259 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 260 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 261 const PetscInt *cols; 262 const PetscScalar *vals; 263 264 PetscFunctionBegin; 265 266 if (reuse == MAT_REUSE_MATRIX) { 267 B = *newmat; 268 } else { 269 if (PetscDefined(USE_DEBUG) || !a->ilen) { 270 PetscCall(PetscMalloc1(m,&rowlengths)); 271 for (i=0; i<m; i++) { 272 rowlengths[i] = ai[i+1] - ai[i]; 273 } 274 } 275 if (PetscDefined(USE_DEBUG) && a->ilen) { 276 PetscBool eq; 277 PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 278 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 279 PetscCall(PetscFree(rowlengths)); 280 rowlengths = a->ilen; 281 } else if (a->ilen) rowlengths = a->ilen; 282 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 283 PetscCall(MatSetSizes(B,m,n,m,n)); 284 PetscCall(MatSetType(B,MATSEQSELL)); 285 PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths)); 286 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 287 } 288 289 for (row=0; row<m; row++) { 290 PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 292 PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 293 } 294 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 295 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 296 B->rmap->bs = A->rmap->bs; 297 298 if (reuse == MAT_INPLACE_MATRIX) { 299 PetscCall(MatHeaderReplace(A,&B)); 300 } else { 301 *newmat = B; 302 } 303 PetscFunctionReturn(0); 304 } 305 306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 307 { 308 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 309 PetscScalar *y; 310 const PetscScalar *x; 311 const MatScalar *aval=a->val; 312 PetscInt totalslices=a->totalslices; 313 const PetscInt *acolidx=a->colidx; 314 PetscInt i,j; 315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 316 __m512d vec_x,vec_y,vec_vals; 317 __m256i vec_idx; 318 __mmask8 mask; 319 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 320 __m256i vec_idx2,vec_idx3,vec_idx4; 321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 322 __m128i vec_idx; 323 __m256d vec_x,vec_y,vec_y2,vec_vals; 324 MatScalar yval; 325 PetscInt r,rows_left,row,nnz_in_row; 326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 327 __m128d vec_x_tmp; 328 __m256d vec_x,vec_y,vec_y2,vec_vals; 329 MatScalar yval; 330 PetscInt r,rows_left,row,nnz_in_row; 331 #else 332 PetscScalar sum[8]; 333 #endif 334 335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 336 #pragma disjoint(*x,*y,*aval) 337 #endif 338 339 PetscFunctionBegin; 340 PetscCall(VecGetArrayRead(xx,&x)); 341 PetscCall(VecGetArray(yy,&y)); 342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 343 for (i=0; i<totalslices; i++) { /* loop over slices */ 344 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 345 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 346 347 vec_y = _mm512_setzero_pd(); 348 vec_y2 = _mm512_setzero_pd(); 349 vec_y3 = _mm512_setzero_pd(); 350 vec_y4 = _mm512_setzero_pd(); 351 352 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 353 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 354 case 3: 355 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 358 acolidx += 8; aval += 8; 359 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 360 acolidx += 8; aval += 8; 361 j += 3; 362 break; 363 case 2: 364 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 365 acolidx += 8; aval += 8; 366 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 367 acolidx += 8; aval += 8; 368 j += 2; 369 break; 370 case 1: 371 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 372 acolidx += 8; aval += 8; 373 j += 1; 374 break; 375 } 376 #pragma novector 377 for (; j<(a->sliidx[i+1]>>3); j+=4) { 378 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 383 acolidx += 8; aval += 8; 384 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 385 acolidx += 8; aval += 8; 386 } 387 388 vec_y = _mm512_add_pd(vec_y,vec_y2); 389 vec_y = _mm512_add_pd(vec_y,vec_y3); 390 vec_y = _mm512_add_pd(vec_y,vec_y4); 391 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 392 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 393 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 394 } else { 395 _mm512_storeu_pd(&y[8*i],vec_y); 396 } 397 } 398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 399 for (i=0; i<totalslices; i++) { /* loop over full slices */ 400 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 401 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 402 403 /* last slice may have padding rows. Don't use vectorization. */ 404 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 405 rows_left = A->rmap->n - 8*i; 406 for (r=0; r<rows_left; ++r) { 407 yval = (MatScalar)0; 408 row = 8*i + r; 409 nnz_in_row = a->rlen[row]; 410 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 411 y[row] = yval; 412 } 413 break; 414 } 415 416 vec_y = _mm256_setzero_pd(); 417 vec_y2 = _mm256_setzero_pd(); 418 419 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 420 #pragma novector 421 #pragma unroll(2) 422 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 424 aval += 4; acolidx += 4; 425 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 426 aval += 4; acolidx += 4; 427 } 428 429 _mm256_storeu_pd(y+i*8,vec_y); 430 _mm256_storeu_pd(y+i*8+4,vec_y2); 431 } 432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 433 for (i=0; i<totalslices; i++) { /* loop over full slices */ 434 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 435 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 436 437 vec_y = _mm256_setzero_pd(); 438 vec_y2 = _mm256_setzero_pd(); 439 440 /* last slice may have padding rows. Don't use vectorization. */ 441 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 442 rows_left = A->rmap->n - 8*i; 443 for (r=0; r<rows_left; ++r) { 444 yval = (MatScalar)0; 445 row = 8*i + r; 446 nnz_in_row = a->rlen[row]; 447 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 448 y[row] = yval; 449 } 450 break; 451 } 452 453 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 454 #pragma novector 455 #pragma unroll(2) 456 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_setzero_pd(); 459 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 460 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 461 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 462 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 463 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 464 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 465 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 466 aval += 4; 467 468 vec_vals = _mm256_loadu_pd(aval); 469 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 470 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 471 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 472 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 473 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 474 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 475 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 476 aval += 4; 477 } 478 479 _mm256_storeu_pd(y + i*8, vec_y); 480 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 481 } 482 #else 483 for (i=0; i<totalslices; i++) { /* loop over slices */ 484 for (j=0; j<8; j++) sum[j] = 0.0; 485 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 486 sum[0] += aval[j] * x[acolidx[j]]; 487 sum[1] += aval[j+1] * x[acolidx[j+1]]; 488 sum[2] += aval[j+2] * x[acolidx[j+2]]; 489 sum[3] += aval[j+3] * x[acolidx[j+3]]; 490 sum[4] += aval[j+4] * x[acolidx[j+4]]; 491 sum[5] += aval[j+5] * x[acolidx[j+5]]; 492 sum[6] += aval[j+6] * x[acolidx[j+6]]; 493 sum[7] += aval[j+7] * x[acolidx[j+7]]; 494 } 495 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 496 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 497 } else { 498 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 499 } 500 } 501 #endif 502 503 PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 504 PetscCall(VecRestoreArrayRead(xx,&x)); 505 PetscCall(VecRestoreArray(yy,&y)); 506 PetscFunctionReturn(0); 507 } 508 509 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 510 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 511 { 512 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 513 PetscScalar *y,*z; 514 const PetscScalar *x; 515 const MatScalar *aval=a->val; 516 PetscInt totalslices=a->totalslices; 517 const PetscInt *acolidx=a->colidx; 518 PetscInt i,j; 519 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 520 __m512d vec_x,vec_y,vec_vals; 521 __m256i vec_idx; 522 __mmask8 mask; 523 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 524 __m256i vec_idx2,vec_idx3,vec_idx4; 525 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 526 __m128d vec_x_tmp; 527 __m256d vec_x,vec_y,vec_y2,vec_vals; 528 MatScalar yval; 529 PetscInt r,row,nnz_in_row; 530 #else 531 PetscScalar sum[8]; 532 #endif 533 534 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 535 #pragma disjoint(*x,*y,*aval) 536 #endif 537 538 PetscFunctionBegin; 539 PetscCall(VecGetArrayRead(xx,&x)); 540 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 541 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 542 for (i=0; i<totalslices; i++) { /* loop over slices */ 543 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 544 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 545 546 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 547 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 548 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 549 } else { 550 vec_y = _mm512_loadu_pd(&y[8*i]); 551 } 552 vec_y2 = _mm512_setzero_pd(); 553 vec_y3 = _mm512_setzero_pd(); 554 vec_y4 = _mm512_setzero_pd(); 555 556 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 557 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 558 case 3: 559 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 560 acolidx += 8; aval += 8; 561 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 562 acolidx += 8; aval += 8; 563 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 564 acolidx += 8; aval += 8; 565 j += 3; 566 break; 567 case 2: 568 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 569 acolidx += 8; aval += 8; 570 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 571 acolidx += 8; aval += 8; 572 j += 2; 573 break; 574 case 1: 575 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 576 acolidx += 8; aval += 8; 577 j += 1; 578 break; 579 } 580 #pragma novector 581 for (; j<(a->sliidx[i+1]>>3); j+=4) { 582 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 583 acolidx += 8; aval += 8; 584 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 585 acolidx += 8; aval += 8; 586 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 587 acolidx += 8; aval += 8; 588 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 589 acolidx += 8; aval += 8; 590 } 591 592 vec_y = _mm512_add_pd(vec_y,vec_y2); 593 vec_y = _mm512_add_pd(vec_y,vec_y3); 594 vec_y = _mm512_add_pd(vec_y,vec_y4); 595 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 596 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 597 } else { 598 _mm512_storeu_pd(&z[8*i],vec_y); 599 } 600 } 601 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 602 for (i=0; i<totalslices; i++) { /* loop over full slices */ 603 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 604 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 605 606 /* last slice may have padding rows. Don't use vectorization. */ 607 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 608 for (r=0; r<(A->rmap->n & 0x07); ++r) { 609 row = 8*i + r; 610 yval = (MatScalar)0.0; 611 nnz_in_row = a->rlen[row]; 612 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 613 z[row] = y[row] + yval; 614 } 615 break; 616 } 617 618 vec_y = _mm256_loadu_pd(y+8*i); 619 vec_y2 = _mm256_loadu_pd(y+8*i+4); 620 621 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 622 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 623 vec_vals = _mm256_loadu_pd(aval); 624 vec_x_tmp = _mm_setzero_pd(); 625 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 626 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 627 vec_x = _mm256_setzero_pd(); 628 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 629 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 630 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 631 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 632 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 633 aval += 4; 634 635 vec_vals = _mm256_loadu_pd(aval); 636 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 637 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 638 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 639 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 640 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 641 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 642 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 643 aval += 4; 644 } 645 646 _mm256_storeu_pd(z+i*8,vec_y); 647 _mm256_storeu_pd(z+i*8+4,vec_y2); 648 } 649 #else 650 for (i=0; i<totalslices; i++) { /* loop over slices */ 651 for (j=0; j<8; j++) sum[j] = 0.0; 652 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 653 sum[0] += aval[j] * x[acolidx[j]]; 654 sum[1] += aval[j+1] * x[acolidx[j+1]]; 655 sum[2] += aval[j+2] * x[acolidx[j+2]]; 656 sum[3] += aval[j+3] * x[acolidx[j+3]]; 657 sum[4] += aval[j+4] * x[acolidx[j+4]]; 658 sum[5] += aval[j+5] * x[acolidx[j+5]]; 659 sum[6] += aval[j+6] * x[acolidx[j+6]]; 660 sum[7] += aval[j+7] * x[acolidx[j+7]]; 661 } 662 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 663 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 664 } else { 665 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 666 } 667 } 668 #endif 669 670 PetscCall(PetscLogFlops(2.0*a->nz)); 671 PetscCall(VecRestoreArrayRead(xx,&x)); 672 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 673 PetscFunctionReturn(0); 674 } 675 676 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 677 { 678 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 679 PetscScalar *y; 680 const PetscScalar *x; 681 const MatScalar *aval=a->val; 682 const PetscInt *acolidx=a->colidx; 683 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 684 685 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 686 #pragma disjoint(*x,*y,*aval) 687 #endif 688 689 PetscFunctionBegin; 690 if (A->symmetric) { 691 PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy)); 692 PetscFunctionReturn(0); 693 } 694 if (zz != yy) PetscCall(VecCopy(zz,yy)); 695 PetscCall(VecGetArrayRead(xx,&x)); 696 PetscCall(VecGetArray(yy,&y)); 697 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 698 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 699 for (r=0; r<(A->rmap->n & 0x07); ++r) { 700 row = 8*i + r; 701 nnz_in_row = a->rlen[row]; 702 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 703 } 704 break; 705 } 706 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 707 y[acolidx[j]] += aval[j] * x[8*i]; 708 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 709 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 710 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 711 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 712 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 713 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 714 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 715 } 716 } 717 PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 718 PetscCall(VecRestoreArrayRead(xx,&x)); 719 PetscCall(VecRestoreArray(yy,&y)); 720 PetscFunctionReturn(0); 721 } 722 723 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 724 { 725 PetscFunctionBegin; 726 if (A->symmetric) { 727 PetscCall(MatMult_SeqSELL(A,xx,yy)); 728 } else { 729 PetscCall(VecSet(yy,0.0)); 730 PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 731 } 732 PetscFunctionReturn(0); 733 } 734 735 /* 736 Checks for missing diagonals 737 */ 738 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 739 { 740 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 741 PetscInt *diag,i; 742 743 PetscFunctionBegin; 744 *missing = PETSC_FALSE; 745 if (A->rmap->n > 0 && !(a->colidx)) { 746 *missing = PETSC_TRUE; 747 if (d) *d = 0; 748 PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 749 } else { 750 diag = a->diag; 751 for (i=0; i<A->rmap->n; i++) { 752 if (diag[i] == -1) { 753 *missing = PETSC_TRUE; 754 if (d) *d = i; 755 PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 756 break; 757 } 758 } 759 } 760 PetscFunctionReturn(0); 761 } 762 763 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 764 { 765 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 766 PetscInt i,j,m=A->rmap->n,shift; 767 768 PetscFunctionBegin; 769 if (!a->diag) { 770 PetscCall(PetscMalloc1(m,&a->diag)); 771 PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 772 a->free_diag = PETSC_TRUE; 773 } 774 for (i=0; i<m; i++) { /* loop over rows */ 775 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 776 a->diag[i] = -1; 777 for (j=0; j<a->rlen[i]; j++) { 778 if (a->colidx[shift+j*8] == i) { 779 a->diag[i] = shift+j*8; 780 break; 781 } 782 } 783 } 784 PetscFunctionReturn(0); 785 } 786 787 /* 788 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 789 */ 790 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 791 { 792 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 793 PetscInt i,*diag,m = A->rmap->n; 794 MatScalar *val = a->val; 795 PetscScalar *idiag,*mdiag; 796 797 PetscFunctionBegin; 798 if (a->idiagvalid) PetscFunctionReturn(0); 799 PetscCall(MatMarkDiagonal_SeqSELL(A)); 800 diag = a->diag; 801 if (!a->idiag) { 802 PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 803 PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 804 val = a->val; 805 } 806 mdiag = a->mdiag; 807 idiag = a->idiag; 808 809 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 810 for (i=0; i<m; i++) { 811 mdiag[i] = val[diag[i]]; 812 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 813 if (PetscRealPart(fshift)) { 814 PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 815 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 816 A->factorerror_zeropivot_value = 0.0; 817 A->factorerror_zeropivot_row = i; 818 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 819 } 820 idiag[i] = 1.0/val[diag[i]]; 821 } 822 PetscCall(PetscLogFlops(m)); 823 } else { 824 for (i=0; i<m; i++) { 825 mdiag[i] = val[diag[i]]; 826 idiag[i] = omega/(fshift + val[diag[i]]); 827 } 828 PetscCall(PetscLogFlops(2.0*m)); 829 } 830 a->idiagvalid = PETSC_TRUE; 831 PetscFunctionReturn(0); 832 } 833 834 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 835 { 836 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 837 838 PetscFunctionBegin; 839 PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 840 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 841 PetscFunctionReturn(0); 842 } 843 844 PetscErrorCode MatDestroy_SeqSELL(Mat A) 845 { 846 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 847 848 PetscFunctionBegin; 849 #if defined(PETSC_USE_LOG) 850 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 851 #endif 852 PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 853 PetscCall(ISDestroy(&a->row)); 854 PetscCall(ISDestroy(&a->col)); 855 PetscCall(PetscFree(a->diag)); 856 PetscCall(PetscFree(a->rlen)); 857 PetscCall(PetscFree(a->sliidx)); 858 PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 859 PetscCall(PetscFree(a->solve_work)); 860 PetscCall(ISDestroy(&a->icol)); 861 PetscCall(PetscFree(a->saved_values)); 862 PetscCall(PetscFree2(a->getrowcols,a->getrowvals)); 863 864 PetscCall(PetscFree(A->data)); 865 866 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 867 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 868 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 869 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 870 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLGetArray_C",NULL)); 871 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLRestoreArray_C",NULL)); 872 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqsell_seqaij_C",NULL)); 873 PetscFunctionReturn(0); 874 } 875 876 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 877 { 878 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 879 880 PetscFunctionBegin; 881 switch (op) { 882 case MAT_ROW_ORIENTED: 883 a->roworiented = flg; 884 break; 885 case MAT_KEEP_NONZERO_PATTERN: 886 a->keepnonzeropattern = flg; 887 break; 888 case MAT_NEW_NONZERO_LOCATIONS: 889 a->nonew = (flg ? 0 : 1); 890 break; 891 case MAT_NEW_NONZERO_LOCATION_ERR: 892 a->nonew = (flg ? -1 : 0); 893 break; 894 case MAT_NEW_NONZERO_ALLOCATION_ERR: 895 a->nonew = (flg ? -2 : 0); 896 break; 897 case MAT_UNUSED_NONZERO_LOCATION_ERR: 898 a->nounused = (flg ? -1 : 0); 899 break; 900 case MAT_FORCE_DIAGONAL_ENTRIES: 901 case MAT_IGNORE_OFF_PROC_ENTRIES: 902 case MAT_USE_HASH_TABLE: 903 case MAT_SORTED_FULL: 904 PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 905 break; 906 case MAT_SPD: 907 case MAT_SYMMETRIC: 908 case MAT_STRUCTURALLY_SYMMETRIC: 909 case MAT_HERMITIAN: 910 case MAT_SYMMETRY_ETERNAL: 911 /* These options are handled directly by MatSetOption() */ 912 break; 913 default: 914 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 915 } 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 920 { 921 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 922 PetscInt i,j,n,shift; 923 PetscScalar *x,zero=0.0; 924 925 PetscFunctionBegin; 926 PetscCall(VecGetLocalSize(v,&n)); 927 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 928 929 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 930 PetscInt *diag=a->diag; 931 PetscCall(VecGetArray(v,&x)); 932 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 933 PetscCall(VecRestoreArray(v,&x)); 934 PetscFunctionReturn(0); 935 } 936 937 PetscCall(VecSet(v,zero)); 938 PetscCall(VecGetArray(v,&x)); 939 for (i=0; i<n; i++) { /* loop over rows */ 940 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 941 x[i] = 0; 942 for (j=0; j<a->rlen[i]; j++) { 943 if (a->colidx[shift+j*8] == i) { 944 x[i] = a->val[shift+j*8]; 945 break; 946 } 947 } 948 } 949 PetscCall(VecRestoreArray(v,&x)); 950 PetscFunctionReturn(0); 951 } 952 953 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 954 { 955 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 956 const PetscScalar *l,*r; 957 PetscInt i,j,m,n,row; 958 959 PetscFunctionBegin; 960 if (ll) { 961 /* The local size is used so that VecMPI can be passed to this routine 962 by MatDiagonalScale_MPISELL */ 963 PetscCall(VecGetLocalSize(ll,&m)); 964 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 965 PetscCall(VecGetArrayRead(ll,&l)); 966 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 967 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 968 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 969 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 970 } 971 } else { 972 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 973 a->val[j] *= l[8*i+row]; 974 } 975 } 976 } 977 PetscCall(VecRestoreArrayRead(ll,&l)); 978 PetscCall(PetscLogFlops(a->nz)); 979 } 980 if (rr) { 981 PetscCall(VecGetLocalSize(rr,&n)); 982 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 983 PetscCall(VecGetArrayRead(rr,&r)); 984 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 985 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 986 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 987 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 988 } 989 } else { 990 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 991 a->val[j] *= r[a->colidx[j]]; 992 } 993 } 994 } 995 PetscCall(VecRestoreArrayRead(rr,&r)); 996 PetscCall(PetscLogFlops(a->nz)); 997 } 998 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 999 PetscFunctionReturn(0); 1000 } 1001 1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 1003 { 1004 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1005 PetscInt *cp,i,k,low,high,t,row,col,l; 1006 PetscInt shift; 1007 MatScalar *vp; 1008 1009 PetscFunctionBegin; 1010 for (k=0; k<m; k++) { /* loop over requested rows */ 1011 row = im[k]; 1012 if (row<0) continue; 1013 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1014 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1015 cp = a->colidx+shift; /* pointer to the row */ 1016 vp = a->val+shift; /* pointer to the row */ 1017 for (l=0; l<n; l++) { /* loop over requested columns */ 1018 col = in[l]; 1019 if (col<0) continue; 1020 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1021 high = a->rlen[row]; low = 0; /* assume unsorted */ 1022 while (high-low > 5) { 1023 t = (low+high)/2; 1024 if (*(cp+t*8) > col) high = t; 1025 else low = t; 1026 } 1027 for (i=low; i<high; i++) { 1028 if (*(cp+8*i) > col) break; 1029 if (*(cp+8*i) == col) { 1030 *v++ = *(vp+8*i); 1031 goto finished; 1032 } 1033 } 1034 *v++ = 0.0; 1035 finished:; 1036 } 1037 } 1038 PetscFunctionReturn(0); 1039 } 1040 1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1042 { 1043 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1044 PetscInt i,j,m=A->rmap->n,shift; 1045 const char *name; 1046 PetscViewerFormat format; 1047 1048 PetscFunctionBegin; 1049 PetscCall(PetscViewerGetFormat(viewer,&format)); 1050 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1051 PetscInt nofinalvalue = 0; 1052 /* 1053 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1054 nofinalvalue = 1; 1055 } 1056 */ 1057 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1058 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1059 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1060 #if defined(PETSC_USE_COMPLEX) 1061 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1062 #else 1063 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1064 #endif 1065 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1066 1067 for (i=0; i<m; i++) { 1068 shift = a->sliidx[i>>3]+(i&0x07); 1069 for (j=0; j<a->rlen[i]; j++) { 1070 #if defined(PETSC_USE_COMPLEX) 1071 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1072 #else 1073 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1074 #endif 1075 } 1076 } 1077 /* 1078 if (nofinalvalue) { 1079 #if defined(PETSC_USE_COMPLEX) 1080 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1081 #else 1082 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1083 #endif 1084 } 1085 */ 1086 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1087 PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1088 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1089 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1090 PetscFunctionReturn(0); 1091 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1092 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1093 for (i=0; i<m; i++) { 1094 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1095 shift = a->sliidx[i>>3]+(i&0x07); 1096 for (j=0; j<a->rlen[i]; j++) { 1097 #if defined(PETSC_USE_COMPLEX) 1098 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1099 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1100 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1101 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1102 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1103 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1104 } 1105 #else 1106 if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1107 #endif 1108 } 1109 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1110 } 1111 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1112 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1113 PetscInt cnt=0,jcnt; 1114 PetscScalar value; 1115 #if defined(PETSC_USE_COMPLEX) 1116 PetscBool realonly=PETSC_TRUE; 1117 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1118 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1119 realonly = PETSC_FALSE; 1120 break; 1121 } 1122 } 1123 #endif 1124 1125 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1126 for (i=0; i<m; i++) { 1127 jcnt = 0; 1128 shift = a->sliidx[i>>3]+(i&0x07); 1129 for (j=0; j<A->cmap->n; j++) { 1130 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1131 value = a->val[cnt++]; 1132 jcnt++; 1133 } else { 1134 value = 0.0; 1135 } 1136 #if defined(PETSC_USE_COMPLEX) 1137 if (realonly) { 1138 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1139 } else { 1140 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1141 } 1142 #else 1143 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1144 #endif 1145 } 1146 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1147 } 1148 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1149 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1150 PetscInt fshift=1; 1151 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1152 #if defined(PETSC_USE_COMPLEX) 1153 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1154 #else 1155 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1156 #endif 1157 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1158 for (i=0; i<m; i++) { 1159 shift = a->sliidx[i>>3]+(i&0x07); 1160 for (j=0; j<a->rlen[i]; j++) { 1161 #if defined(PETSC_USE_COMPLEX) 1162 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1163 #else 1164 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1165 #endif 1166 } 1167 } 1168 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1169 } else if (format == PETSC_VIEWER_NATIVE) { 1170 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1171 PetscInt row; 1172 PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1173 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1174 #if defined(PETSC_USE_COMPLEX) 1175 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1176 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1177 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1178 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1179 } else { 1180 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1181 } 1182 #else 1183 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1184 #endif 1185 } 1186 } 1187 } else { 1188 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1189 if (A->factortype) { 1190 for (i=0; i<m; i++) { 1191 shift = a->sliidx[i>>3]+(i&0x07); 1192 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1193 /* L part */ 1194 for (j=shift; j<a->diag[i]; j+=8) { 1195 #if defined(PETSC_USE_COMPLEX) 1196 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1197 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1198 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1199 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1200 } else { 1201 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1202 } 1203 #else 1204 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1205 #endif 1206 } 1207 /* diagonal */ 1208 j = a->diag[i]; 1209 #if defined(PETSC_USE_COMPLEX) 1210 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1211 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1212 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1213 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1214 } else { 1215 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1216 } 1217 #else 1218 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1219 #endif 1220 1221 /* U part */ 1222 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1223 #if defined(PETSC_USE_COMPLEX) 1224 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1225 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1226 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1227 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1228 } else { 1229 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1230 } 1231 #else 1232 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1233 #endif 1234 } 1235 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1236 } 1237 } else { 1238 for (i=0; i<m; i++) { 1239 shift = a->sliidx[i>>3]+(i&0x07); 1240 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1241 for (j=0; j<a->rlen[i]; j++) { 1242 #if defined(PETSC_USE_COMPLEX) 1243 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1244 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1245 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1246 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1247 } else { 1248 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1249 } 1250 #else 1251 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1252 #endif 1253 } 1254 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1255 } 1256 } 1257 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1258 } 1259 PetscCall(PetscViewerFlush(viewer)); 1260 PetscFunctionReturn(0); 1261 } 1262 1263 #include <petscdraw.h> 1264 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1265 { 1266 Mat A=(Mat)Aa; 1267 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1268 PetscInt i,j,m=A->rmap->n,shift; 1269 int color; 1270 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1271 PetscViewer viewer; 1272 PetscViewerFormat format; 1273 1274 PetscFunctionBegin; 1275 PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1276 PetscCall(PetscViewerGetFormat(viewer,&format)); 1277 PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1278 1279 /* loop over matrix elements drawing boxes */ 1280 1281 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1282 PetscDrawCollectiveBegin(draw); 1283 /* Blue for negative, Cyan for zero and Red for positive */ 1284 color = PETSC_DRAW_BLUE; 1285 for (i=0; i<m; i++) { 1286 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1287 y_l = m - i - 1.0; y_r = y_l + 1.0; 1288 for (j=0; j<a->rlen[i]; j++) { 1289 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1290 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1291 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1292 } 1293 } 1294 color = PETSC_DRAW_CYAN; 1295 for (i=0; i<m; i++) { 1296 shift = a->sliidx[i>>3]+(i&0x07); 1297 y_l = m - i - 1.0; y_r = y_l + 1.0; 1298 for (j=0; j<a->rlen[i]; j++) { 1299 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1300 if (a->val[shift+8*j] != 0.) continue; 1301 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1302 } 1303 } 1304 color = PETSC_DRAW_RED; 1305 for (i=0; i<m; i++) { 1306 shift = a->sliidx[i>>3]+(i&0x07); 1307 y_l = m - i - 1.0; y_r = y_l + 1.0; 1308 for (j=0; j<a->rlen[i]; j++) { 1309 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1310 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1311 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1312 } 1313 } 1314 PetscDrawCollectiveEnd(draw); 1315 } else { 1316 /* use contour shading to indicate magnitude of values */ 1317 /* first determine max of all nonzero values */ 1318 PetscReal minv=0.0,maxv=0.0; 1319 PetscInt count=0; 1320 PetscDraw popup; 1321 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1322 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1323 } 1324 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1325 PetscCall(PetscDrawGetPopup(draw,&popup)); 1326 PetscCall(PetscDrawScalePopup(popup,minv,maxv)); 1327 1328 PetscDrawCollectiveBegin(draw); 1329 for (i=0; i<m; i++) { 1330 shift = a->sliidx[i>>3]+(i&0x07); 1331 y_l = m - i - 1.0; 1332 y_r = y_l + 1.0; 1333 for (j=0; j<a->rlen[i]; j++) { 1334 x_l = a->colidx[shift+j*8]; 1335 x_r = x_l + 1.0; 1336 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1337 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1338 count++; 1339 } 1340 } 1341 PetscDrawCollectiveEnd(draw); 1342 } 1343 PetscFunctionReturn(0); 1344 } 1345 1346 #include <petscdraw.h> 1347 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1348 { 1349 PetscDraw draw; 1350 PetscReal xr,yr,xl,yl,h,w; 1351 PetscBool isnull; 1352 1353 PetscFunctionBegin; 1354 PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw)); 1355 PetscCall(PetscDrawIsNull(draw,&isnull)); 1356 if (isnull) PetscFunctionReturn(0); 1357 1358 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1359 xr += w; yr += h; xl = -w; yl = -h; 1360 PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1361 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1362 PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1363 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1364 PetscCall(PetscDrawSave(draw)); 1365 PetscFunctionReturn(0); 1366 } 1367 1368 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1369 { 1370 PetscBool iascii,isbinary,isdraw; 1371 1372 PetscFunctionBegin; 1373 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1374 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1375 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1376 if (iascii) { 1377 PetscCall(MatView_SeqSELL_ASCII(A,viewer)); 1378 } else if (isbinary) { 1379 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1380 } else if (isdraw) { 1381 PetscCall(MatView_SeqSELL_Draw(A,viewer)); 1382 } 1383 PetscFunctionReturn(0); 1384 } 1385 1386 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1387 { 1388 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1389 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1390 MatScalar *vp; 1391 1392 PetscFunctionBegin; 1393 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1394 /* To do: compress out the unused elements */ 1395 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1396 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1397 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1398 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1399 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1400 for (i=0; i<a->totalslices; ++i) { 1401 shift = a->sliidx[i]; /* starting index of the slice */ 1402 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1403 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1404 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1405 row = 8*i + row_in_slice; 1406 nrow = a->rlen[row]; /* number of nonzeros in row */ 1407 /* 1408 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1409 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1410 */ 1411 lastcol = 0; 1412 if (nrow>0) { /* nonempty row */ 1413 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1414 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1415 for (j=1;j<8;j++) { 1416 if (a->rlen[8*i+j]) { 1417 lastcol = cp[j]; 1418 break; 1419 } 1420 } 1421 } else { 1422 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1423 } 1424 1425 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1426 cp[8*k+row_in_slice] = lastcol; 1427 vp[8*k+row_in_slice] = (MatScalar)0; 1428 } 1429 } 1430 } 1431 1432 A->info.mallocs += a->reallocs; 1433 a->reallocs = 0; 1434 1435 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1436 PetscFunctionReturn(0); 1437 } 1438 1439 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1440 { 1441 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1442 1443 PetscFunctionBegin; 1444 info->block_size = 1.0; 1445 info->nz_allocated = a->maxallocmat; 1446 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1447 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1448 info->assemblies = A->num_ass; 1449 info->mallocs = A->info.mallocs; 1450 info->memory = ((PetscObject)A)->mem; 1451 if (A->factortype) { 1452 info->fill_ratio_given = A->info.fill_ratio_given; 1453 info->fill_ratio_needed = A->info.fill_ratio_needed; 1454 info->factor_mallocs = A->info.factor_mallocs; 1455 } else { 1456 info->fill_ratio_given = 0; 1457 info->fill_ratio_needed = 0; 1458 info->factor_mallocs = 0; 1459 } 1460 PetscFunctionReturn(0); 1461 } 1462 1463 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1464 { 1465 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1466 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1467 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1468 MatScalar *vp,value; 1469 1470 PetscFunctionBegin; 1471 for (k=0; k<m; k++) { /* loop over added rows */ 1472 row = im[k]; 1473 if (row < 0) continue; 1474 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1475 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1476 cp = a->colidx+shift; /* pointer to the row */ 1477 vp = a->val+shift; /* pointer to the row */ 1478 nrow = a->rlen[row]; 1479 low = 0; 1480 high = nrow; 1481 1482 for (l=0; l<n; l++) { /* loop over added columns */ 1483 col = in[l]; 1484 if (col<0) continue; 1485 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1486 if (a->roworiented) { 1487 value = v[l+k*n]; 1488 } else { 1489 value = v[k+l*m]; 1490 } 1491 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1492 1493 /* search in this row for the specified column, i indicates the column to be set */ 1494 if (col <= lastcol) low = 0; 1495 else high = nrow; 1496 lastcol = col; 1497 while (high-low > 5) { 1498 t = (low+high)/2; 1499 if (*(cp+t*8) > col) high = t; 1500 else low = t; 1501 } 1502 for (i=low; i<high; i++) { 1503 if (*(cp+i*8) > col) break; 1504 if (*(cp+i*8) == col) { 1505 if (is == ADD_VALUES) *(vp+i*8) += value; 1506 else *(vp+i*8) = value; 1507 low = i + 1; 1508 goto noinsert; 1509 } 1510 } 1511 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1512 if (nonew == 1) goto noinsert; 1513 PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1514 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1515 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1516 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1517 for (ii=nrow-1; ii>=i; ii--) { 1518 *(cp+(ii+1)*8) = *(cp+ii*8); 1519 *(vp+(ii+1)*8) = *(vp+ii*8); 1520 } 1521 a->rlen[row]++; 1522 *(cp+i*8) = col; 1523 *(vp+i*8) = value; 1524 a->nz++; 1525 A->nonzerostate++; 1526 low = i+1; high++; nrow++; 1527 noinsert:; 1528 } 1529 a->rlen[row] = nrow; 1530 } 1531 PetscFunctionReturn(0); 1532 } 1533 1534 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1535 { 1536 PetscFunctionBegin; 1537 /* If the two matrices have the same copy implementation, use fast copy. */ 1538 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1539 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1540 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1541 1542 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1543 PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1544 } else { 1545 PetscCall(MatCopy_Basic(A,B,str)); 1546 } 1547 PetscFunctionReturn(0); 1548 } 1549 1550 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1551 { 1552 PetscFunctionBegin; 1553 PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1554 PetscFunctionReturn(0); 1555 } 1556 1557 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1558 { 1559 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1560 1561 PetscFunctionBegin; 1562 *array = a->val; 1563 PetscFunctionReturn(0); 1564 } 1565 1566 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1567 { 1568 PetscFunctionBegin; 1569 PetscFunctionReturn(0); 1570 } 1571 1572 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1573 { 1574 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1575 PetscInt i; 1576 MatScalar *aval=a->val; 1577 1578 PetscFunctionBegin; 1579 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1580 PetscFunctionReturn(0); 1581 } 1582 1583 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1584 { 1585 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1586 PetscInt i; 1587 MatScalar *aval=a->val; 1588 1589 PetscFunctionBegin; 1590 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1591 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1592 PetscFunctionReturn(0); 1593 } 1594 1595 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1596 { 1597 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1598 MatScalar *aval=a->val; 1599 PetscScalar oalpha=alpha; 1600 PetscBLASInt one=1,size; 1601 1602 PetscFunctionBegin; 1603 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size)); 1604 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1605 PetscCall(PetscLogFlops(a->nz)); 1606 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1607 PetscFunctionReturn(0); 1608 } 1609 1610 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1611 { 1612 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1613 1614 PetscFunctionBegin; 1615 if (!Y->preallocated || !y->nz) { 1616 PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL)); 1617 } 1618 PetscCall(MatShift_Basic(Y,a)); 1619 PetscFunctionReturn(0); 1620 } 1621 1622 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1623 { 1624 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1625 PetscScalar *x,sum,*t; 1626 const MatScalar *idiag=NULL,*mdiag; 1627 const PetscScalar *b,*xb; 1628 PetscInt n,m=A->rmap->n,i,j,shift; 1629 const PetscInt *diag; 1630 1631 PetscFunctionBegin; 1632 its = its*lits; 1633 1634 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1635 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift)); 1636 a->fshift = fshift; 1637 a->omega = omega; 1638 1639 diag = a->diag; 1640 t = a->ssor_work; 1641 idiag = a->idiag; 1642 mdiag = a->mdiag; 1643 1644 PetscCall(VecGetArray(xx,&x)); 1645 PetscCall(VecGetArrayRead(bb,&b)); 1646 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1647 PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1648 PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1649 PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1650 1651 if (flag & SOR_ZERO_INITIAL_GUESS) { 1652 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1653 for (i=0; i<m; i++) { 1654 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1655 sum = b[i]; 1656 n = (diag[i]-shift)/8; 1657 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1658 t[i] = sum; 1659 x[i] = sum*idiag[i]; 1660 } 1661 xb = t; 1662 PetscCall(PetscLogFlops(a->nz)); 1663 } else xb = b; 1664 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1665 for (i=m-1; i>=0; i--) { 1666 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1667 sum = xb[i]; 1668 n = a->rlen[i]-(diag[i]-shift)/8-1; 1669 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1670 if (xb == b) { 1671 x[i] = sum*idiag[i]; 1672 } else { 1673 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1674 } 1675 } 1676 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1677 } 1678 its--; 1679 } 1680 while (its--) { 1681 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1682 for (i=0; i<m; i++) { 1683 /* lower */ 1684 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1685 sum = b[i]; 1686 n = (diag[i]-shift)/8; 1687 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1688 t[i] = sum; /* save application of the lower-triangular part */ 1689 /* upper */ 1690 n = a->rlen[i]-(diag[i]-shift)/8-1; 1691 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1692 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1693 } 1694 xb = t; 1695 PetscCall(PetscLogFlops(2.0*a->nz)); 1696 } else xb = b; 1697 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1698 for (i=m-1; i>=0; i--) { 1699 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1700 sum = xb[i]; 1701 if (xb == b) { 1702 /* whole matrix (no checkpointing available) */ 1703 n = a->rlen[i]; 1704 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1705 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1706 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1707 n = a->rlen[i]-(diag[i]-shift)/8-1; 1708 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1709 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1710 } 1711 } 1712 if (xb == b) { 1713 PetscCall(PetscLogFlops(2.0*a->nz)); 1714 } else { 1715 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1716 } 1717 } 1718 } 1719 PetscCall(VecRestoreArray(xx,&x)); 1720 PetscCall(VecRestoreArrayRead(bb,&b)); 1721 PetscFunctionReturn(0); 1722 } 1723 1724 /* -------------------------------------------------------------------*/ 1725 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1726 MatGetRow_SeqSELL, 1727 MatRestoreRow_SeqSELL, 1728 MatMult_SeqSELL, 1729 /* 4*/ MatMultAdd_SeqSELL, 1730 MatMultTranspose_SeqSELL, 1731 MatMultTransposeAdd_SeqSELL, 1732 NULL, 1733 NULL, 1734 NULL, 1735 /* 10*/ NULL, 1736 NULL, 1737 NULL, 1738 MatSOR_SeqSELL, 1739 NULL, 1740 /* 15*/ MatGetInfo_SeqSELL, 1741 MatEqual_SeqSELL, 1742 MatGetDiagonal_SeqSELL, 1743 MatDiagonalScale_SeqSELL, 1744 NULL, 1745 /* 20*/ NULL, 1746 MatAssemblyEnd_SeqSELL, 1747 MatSetOption_SeqSELL, 1748 MatZeroEntries_SeqSELL, 1749 /* 24*/ NULL, 1750 NULL, 1751 NULL, 1752 NULL, 1753 NULL, 1754 /* 29*/ MatSetUp_SeqSELL, 1755 NULL, 1756 NULL, 1757 NULL, 1758 NULL, 1759 /* 34*/ MatDuplicate_SeqSELL, 1760 NULL, 1761 NULL, 1762 NULL, 1763 NULL, 1764 /* 39*/ NULL, 1765 NULL, 1766 NULL, 1767 MatGetValues_SeqSELL, 1768 MatCopy_SeqSELL, 1769 /* 44*/ NULL, 1770 MatScale_SeqSELL, 1771 MatShift_SeqSELL, 1772 NULL, 1773 NULL, 1774 /* 49*/ NULL, 1775 NULL, 1776 NULL, 1777 NULL, 1778 NULL, 1779 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1780 NULL, 1781 NULL, 1782 NULL, 1783 NULL, 1784 /* 59*/ NULL, 1785 MatDestroy_SeqSELL, 1786 MatView_SeqSELL, 1787 NULL, 1788 NULL, 1789 /* 64*/ NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 NULL, 1794 /* 69*/ NULL, 1795 NULL, 1796 NULL, 1797 NULL, 1798 NULL, 1799 /* 74*/ NULL, 1800 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1801 NULL, 1802 NULL, 1803 NULL, 1804 /* 79*/ NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 NULL, 1809 /* 84*/ NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 /* 89*/ NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 /* 94*/ NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 /* 99*/ NULL, 1825 NULL, 1826 NULL, 1827 MatConjugate_SeqSELL, 1828 NULL, 1829 /*104*/ NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 NULL, 1834 /*109*/ NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 MatMissingDiagonal_SeqSELL, 1839 /*114*/ NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 /*119*/ NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 /*124*/ NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 NULL, 1854 /*129*/ NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 /*134*/ NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 /*139*/ NULL, 1865 NULL, 1866 NULL, 1867 MatFDColoringSetUp_SeqXAIJ, 1868 NULL, 1869 /*144*/ NULL, 1870 NULL, 1871 NULL, 1872 NULL 1873 }; 1874 1875 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1876 { 1877 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1878 1879 PetscFunctionBegin; 1880 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1881 1882 /* allocate space for values if not already there */ 1883 if (!a->saved_values) { 1884 PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values)); 1885 PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar))); 1886 } 1887 1888 /* copy values over */ 1889 PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices])); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1894 { 1895 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1896 1897 PetscFunctionBegin; 1898 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1899 PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1900 PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices])); 1901 PetscFunctionReturn(0); 1902 } 1903 1904 /*@C 1905 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1906 1907 Not Collective 1908 1909 Input Parameters: 1910 . mat - a MATSEQSELL matrix 1911 . array - pointer to the data 1912 1913 Level: intermediate 1914 1915 .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1916 @*/ 1917 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1918 { 1919 PetscFunctionBegin; 1920 PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array)); 1921 PetscFunctionReturn(0); 1922 } 1923 1924 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1925 { 1926 Mat_SeqSELL *b; 1927 PetscMPIInt size; 1928 1929 PetscFunctionBegin; 1930 PetscCall(PetscCitationsRegister(citation,&cited)); 1931 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 1932 PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1933 1934 PetscCall(PetscNewLog(B,&b)); 1935 1936 B->data = (void*)b; 1937 1938 PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps))); 1939 1940 b->row = NULL; 1941 b->col = NULL; 1942 b->icol = NULL; 1943 b->reallocs = 0; 1944 b->ignorezeroentries = PETSC_FALSE; 1945 b->roworiented = PETSC_TRUE; 1946 b->nonew = 0; 1947 b->diag = NULL; 1948 b->solve_work = NULL; 1949 B->spptr = NULL; 1950 b->saved_values = NULL; 1951 b->idiag = NULL; 1952 b->mdiag = NULL; 1953 b->ssor_work = NULL; 1954 b->omega = 1.0; 1955 b->fshift = 0.0; 1956 b->idiagvalid = PETSC_FALSE; 1957 b->keepnonzeropattern = PETSC_FALSE; 1958 1959 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL)); 1960 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL)); 1961 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL)); 1962 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL)); 1963 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL)); 1964 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL)); 1965 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ)); 1966 PetscFunctionReturn(0); 1967 } 1968 1969 /* 1970 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1971 */ 1972 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1973 { 1974 Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data; 1975 PetscInt i,m=A->rmap->n; 1976 PetscInt totalslices=a->totalslices; 1977 1978 PetscFunctionBegin; 1979 C->factortype = A->factortype; 1980 c->row = NULL; 1981 c->col = NULL; 1982 c->icol = NULL; 1983 c->reallocs = 0; 1984 C->assembled = PETSC_TRUE; 1985 1986 PetscCall(PetscLayoutReference(A->rmap,&C->rmap)); 1987 PetscCall(PetscLayoutReference(A->cmap,&C->cmap)); 1988 1989 PetscCall(PetscMalloc1(8*totalslices,&c->rlen)); 1990 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 1991 PetscCall(PetscMalloc1(totalslices+1,&c->sliidx)); 1992 PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt))); 1993 1994 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 1995 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 1996 1997 /* allocate the matrix space */ 1998 if (mallocmatspace) { 1999 PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx)); 2000 PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)))); 2001 2002 c->singlemalloc = PETSC_TRUE; 2003 2004 if (m > 0) { 2005 PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat)); 2006 if (cpvalues == MAT_COPY_VALUES) { 2007 PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat)); 2008 } else { 2009 PetscCall(PetscArrayzero(c->val,a->maxallocmat)); 2010 } 2011 } 2012 } 2013 2014 c->ignorezeroentries = a->ignorezeroentries; 2015 c->roworiented = a->roworiented; 2016 c->nonew = a->nonew; 2017 if (a->diag) { 2018 PetscCall(PetscMalloc1(m,&c->diag)); 2019 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 2020 for (i=0; i<m; i++) { 2021 c->diag[i] = a->diag[i]; 2022 } 2023 } else c->diag = NULL; 2024 2025 c->solve_work = NULL; 2026 c->saved_values = NULL; 2027 c->idiag = NULL; 2028 c->ssor_work = NULL; 2029 c->keepnonzeropattern = a->keepnonzeropattern; 2030 c->free_val = PETSC_TRUE; 2031 c->free_colidx = PETSC_TRUE; 2032 2033 c->maxallocmat = a->maxallocmat; 2034 c->maxallocrow = a->maxallocrow; 2035 c->rlenmax = a->rlenmax; 2036 c->nz = a->nz; 2037 C->preallocated = PETSC_TRUE; 2038 2039 c->nonzerorowcnt = a->nonzerorowcnt; 2040 C->nonzerostate = A->nonzerostate; 2041 2042 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist)); 2043 PetscFunctionReturn(0); 2044 } 2045 2046 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2047 { 2048 PetscFunctionBegin; 2049 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2050 PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n)); 2051 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2052 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2053 } 2054 PetscCall(MatSetType(*B,((PetscObject)A)->type_name)); 2055 PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE)); 2056 PetscFunctionReturn(0); 2057 } 2058 2059 /*MC 2060 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2061 based on the sliced Ellpack format 2062 2063 Options Database Keys: 2064 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions() 2065 2066 Level: beginner 2067 2068 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2069 M*/ 2070 2071 /*MC 2072 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2073 2074 This matrix type is identical to MATSEQSELL when constructed with a single process communicator, 2075 and MATMPISELL otherwise. As a result, for single process communicators, 2076 MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported 2077 for communicators controlling multiple processes. It is recommended that you call both of 2078 the above preallocation routines for simplicity. 2079 2080 Options Database Keys: 2081 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2082 2083 Level: beginner 2084 2085 Notes: 2086 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2087 2088 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2089 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2090 2091 Developer Notes: 2092 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2093 2094 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2095 .vb 2096 (2 0 3 4) 2097 Consider the matrix A = (5 0 6 0) 2098 (0 0 7 8) 2099 (0 0 9 9) 2100 2101 symbolically the Ellpack format can be written as 2102 2103 (2 3 4 |) (0 2 3 |) 2104 v = (5 6 0 |) colidx = (0 2 2 |) 2105 -------- --------- 2106 (7 8 |) (2 3 |) 2107 (9 9 |) (2 3 |) 2108 2109 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2110 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2111 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2112 2113 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2114 2115 .ve 2116 2117 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2118 2119 References: 2120 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2121 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2122 2123 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2124 M*/ 2125 2126 /*@C 2127 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2128 2129 Collective on comm 2130 2131 Input Parameters: 2132 + comm - MPI communicator, set to PETSC_COMM_SELF 2133 . m - number of rows 2134 . n - number of columns 2135 . rlenmax - maximum number of nonzeros in a row 2136 - rlen - array containing the number of nonzeros in the various rows 2137 (possibly different for each row) or NULL 2138 2139 Output Parameter: 2140 . A - the matrix 2141 2142 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2143 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2144 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2145 2146 Notes: 2147 If nnz is given then nz is ignored 2148 2149 Specify the preallocated storage with either rlenmax or rlen (not both). 2150 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2151 allocation. For large problems you MUST preallocate memory or you 2152 will get TERRIBLE performance, see the users' manual chapter on matrices. 2153 2154 Level: intermediate 2155 2156 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2157 2158 @*/ 2159 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2160 { 2161 PetscFunctionBegin; 2162 PetscCall(MatCreate(comm,A)); 2163 PetscCall(MatSetSizes(*A,m,n,m,n)); 2164 PetscCall(MatSetType(*A,MATSEQSELL)); 2165 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen)); 2166 PetscFunctionReturn(0); 2167 } 2168 2169 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2170 { 2171 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2172 PetscInt totalslices=a->totalslices; 2173 2174 PetscFunctionBegin; 2175 /* If the matrix dimensions are not equal,or no of nonzeros */ 2176 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2177 *flg = PETSC_FALSE; 2178 PetscFunctionReturn(0); 2179 } 2180 /* if the a->colidx are the same */ 2181 PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg)); 2182 if (!*flg) PetscFunctionReturn(0); 2183 /* if a->val are the same */ 2184 PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg)); 2185 PetscFunctionReturn(0); 2186 } 2187 2188 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2189 { 2190 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2191 2192 PetscFunctionBegin; 2193 a->idiagvalid = PETSC_FALSE; 2194 PetscFunctionReturn(0); 2195 } 2196 2197 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2198 { 2199 #if defined(PETSC_USE_COMPLEX) 2200 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2201 PetscInt i; 2202 PetscScalar *val = a->val; 2203 2204 PetscFunctionBegin; 2205 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2206 val[i] = PetscConj(val[i]); 2207 } 2208 #else 2209 PetscFunctionBegin; 2210 #endif 2211 PetscFunctionReturn(0); 2212 } 2213