1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 79 80 Level: intermediate 81 82 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 PetscCall(PetscLayoutSetUp(B->rmap)); 108 PetscCall(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 PetscCall(PetscMalloc1(totalslices+1,&b->sliidx)); 131 PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 162 PetscCall(PetscCalloc1(8*totalslices,&b->rlen)); 163 PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 179 PetscFunctionReturn(0); 180 } 181 182 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 183 { 184 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 185 PetscInt shift; 186 187 PetscFunctionBegin; 188 PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 189 if (nz) *nz = a->rlen[row]; 190 shift = a->sliidx[row>>3]+(row&0x07); 191 if (!a->getrowcols) { 192 193 PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 194 } 195 if (idx) { 196 PetscInt j; 197 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 198 *idx = a->getrowcols; 199 } 200 if (v) { 201 PetscInt j; 202 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 203 *v = a->getrowvals; 204 } 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 209 { 210 PetscFunctionBegin; 211 PetscFunctionReturn(0); 212 } 213 214 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 215 { 216 Mat B; 217 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 218 PetscInt i; 219 220 PetscFunctionBegin; 221 if (reuse == MAT_REUSE_MATRIX) { 222 B = *newmat; 223 PetscCall(MatZeroEntries(B)); 224 } else { 225 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 226 PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 227 PetscCall(MatSetType(B,MATSEQAIJ)); 228 PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen)); 229 } 230 231 for (i=0; i<A->rmap->n; i++) { 232 PetscInt nz = 0,*cols = NULL; 233 PetscScalar *vals = NULL; 234 235 PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 236 PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 237 PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 } 239 240 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 241 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 242 B->rmap->bs = A->rmap->bs; 243 244 if (reuse == MAT_INPLACE_MATRIX) { 245 PetscCall(MatHeaderReplace(A,&B)); 246 } else { 247 *newmat = B; 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #include <../src/mat/impls/aij/seq/aij.h> 253 254 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 255 { 256 Mat B; 257 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 258 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 259 const PetscInt *cols; 260 const PetscScalar *vals; 261 262 PetscFunctionBegin; 263 264 if (reuse == MAT_REUSE_MATRIX) { 265 B = *newmat; 266 } else { 267 if (PetscDefined(USE_DEBUG) || !a->ilen) { 268 PetscCall(PetscMalloc1(m,&rowlengths)); 269 for (i=0; i<m; i++) { 270 rowlengths[i] = ai[i+1] - ai[i]; 271 } 272 } 273 if (PetscDefined(USE_DEBUG) && a->ilen) { 274 PetscBool eq; 275 PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 276 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 277 PetscCall(PetscFree(rowlengths)); 278 rowlengths = a->ilen; 279 } else if (a->ilen) rowlengths = a->ilen; 280 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 281 PetscCall(MatSetSizes(B,m,n,m,n)); 282 PetscCall(MatSetType(B,MATSEQSELL)); 283 PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths)); 284 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 285 } 286 287 for (row=0; row<m; row++) { 288 PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 289 PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 290 PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 } 292 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 293 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 294 B->rmap->bs = A->rmap->bs; 295 296 if (reuse == MAT_INPLACE_MATRIX) { 297 PetscCall(MatHeaderReplace(A,&B)); 298 } else { 299 *newmat = B; 300 } 301 PetscFunctionReturn(0); 302 } 303 304 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 305 { 306 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 307 PetscScalar *y; 308 const PetscScalar *x; 309 const MatScalar *aval=a->val; 310 PetscInt totalslices=a->totalslices; 311 const PetscInt *acolidx=a->colidx; 312 PetscInt i,j; 313 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 314 __m512d vec_x,vec_y,vec_vals; 315 __m256i vec_idx; 316 __mmask8 mask; 317 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 318 __m256i vec_idx2,vec_idx3,vec_idx4; 319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 320 __m128i vec_idx; 321 __m256d vec_x,vec_y,vec_y2,vec_vals; 322 MatScalar yval; 323 PetscInt r,rows_left,row,nnz_in_row; 324 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 325 __m128d vec_x_tmp; 326 __m256d vec_x,vec_y,vec_y2,vec_vals; 327 MatScalar yval; 328 PetscInt r,rows_left,row,nnz_in_row; 329 #else 330 PetscScalar sum[8]; 331 #endif 332 333 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 334 #pragma disjoint(*x,*y,*aval) 335 #endif 336 337 PetscFunctionBegin; 338 PetscCall(VecGetArrayRead(xx,&x)); 339 PetscCall(VecGetArray(yy,&y)); 340 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 for (i=0; i<totalslices; i++) { /* loop over slices */ 342 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 343 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 344 345 vec_y = _mm512_setzero_pd(); 346 vec_y2 = _mm512_setzero_pd(); 347 vec_y3 = _mm512_setzero_pd(); 348 vec_y4 = _mm512_setzero_pd(); 349 350 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 351 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 352 case 3: 353 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 354 acolidx += 8; aval += 8; 355 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 358 acolidx += 8; aval += 8; 359 j += 3; 360 break; 361 case 2: 362 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 363 acolidx += 8; aval += 8; 364 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 365 acolidx += 8; aval += 8; 366 j += 2; 367 break; 368 case 1: 369 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 370 acolidx += 8; aval += 8; 371 j += 1; 372 break; 373 } 374 #pragma novector 375 for (; j<(a->sliidx[i+1]>>3); j+=4) { 376 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 377 acolidx += 8; aval += 8; 378 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 383 acolidx += 8; aval += 8; 384 } 385 386 vec_y = _mm512_add_pd(vec_y,vec_y2); 387 vec_y = _mm512_add_pd(vec_y,vec_y3); 388 vec_y = _mm512_add_pd(vec_y,vec_y4); 389 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 390 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 391 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 392 } else { 393 _mm512_storeu_pd(&y[8*i],vec_y); 394 } 395 } 396 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 397 for (i=0; i<totalslices; i++) { /* loop over full slices */ 398 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 399 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 400 401 /* last slice may have padding rows. Don't use vectorization. */ 402 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 403 rows_left = A->rmap->n - 8*i; 404 for (r=0; r<rows_left; ++r) { 405 yval = (MatScalar)0; 406 row = 8*i + r; 407 nnz_in_row = a->rlen[row]; 408 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 409 y[row] = yval; 410 } 411 break; 412 } 413 414 vec_y = _mm256_setzero_pd(); 415 vec_y2 = _mm256_setzero_pd(); 416 417 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 418 #pragma novector 419 #pragma unroll(2) 420 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 421 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 422 aval += 4; acolidx += 4; 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 424 aval += 4; acolidx += 4; 425 } 426 427 _mm256_storeu_pd(y+i*8,vec_y); 428 _mm256_storeu_pd(y+i*8+4,vec_y2); 429 } 430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 431 for (i=0; i<totalslices; i++) { /* loop over full slices */ 432 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 433 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 434 435 vec_y = _mm256_setzero_pd(); 436 vec_y2 = _mm256_setzero_pd(); 437 438 /* last slice may have padding rows. Don't use vectorization. */ 439 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 440 rows_left = A->rmap->n - 8*i; 441 for (r=0; r<rows_left; ++r) { 442 yval = (MatScalar)0; 443 row = 8*i + r; 444 nnz_in_row = a->rlen[row]; 445 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 446 y[row] = yval; 447 } 448 break; 449 } 450 451 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 452 #pragma novector 453 #pragma unroll(2) 454 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 455 vec_vals = _mm256_loadu_pd(aval); 456 vec_x_tmp = _mm_setzero_pd(); 457 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 458 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 459 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 460 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 461 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 462 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 463 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 464 aval += 4; 465 466 vec_vals = _mm256_loadu_pd(aval); 467 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 468 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 469 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 470 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 471 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 472 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 473 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 474 aval += 4; 475 } 476 477 _mm256_storeu_pd(y + i*8, vec_y); 478 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 479 } 480 #else 481 for (i=0; i<totalslices; i++) { /* loop over slices */ 482 for (j=0; j<8; j++) sum[j] = 0.0; 483 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 484 sum[0] += aval[j] * x[acolidx[j]]; 485 sum[1] += aval[j+1] * x[acolidx[j+1]]; 486 sum[2] += aval[j+2] * x[acolidx[j+2]]; 487 sum[3] += aval[j+3] * x[acolidx[j+3]]; 488 sum[4] += aval[j+4] * x[acolidx[j+4]]; 489 sum[5] += aval[j+5] * x[acolidx[j+5]]; 490 sum[6] += aval[j+6] * x[acolidx[j+6]]; 491 sum[7] += aval[j+7] * x[acolidx[j+7]]; 492 } 493 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 494 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 495 } else { 496 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 497 } 498 } 499 #endif 500 501 PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 502 PetscCall(VecRestoreArrayRead(xx,&x)); 503 PetscCall(VecRestoreArray(yy,&y)); 504 PetscFunctionReturn(0); 505 } 506 507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 508 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 509 { 510 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 511 PetscScalar *y,*z; 512 const PetscScalar *x; 513 const MatScalar *aval=a->val; 514 PetscInt totalslices=a->totalslices; 515 const PetscInt *acolidx=a->colidx; 516 PetscInt i,j; 517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 518 __m512d vec_x,vec_y,vec_vals; 519 __m256i vec_idx; 520 __mmask8 mask; 521 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 522 __m256i vec_idx2,vec_idx3,vec_idx4; 523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 524 __m128d vec_x_tmp; 525 __m256d vec_x,vec_y,vec_y2,vec_vals; 526 MatScalar yval; 527 PetscInt r,row,nnz_in_row; 528 #else 529 PetscScalar sum[8]; 530 #endif 531 532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 533 #pragma disjoint(*x,*y,*aval) 534 #endif 535 536 PetscFunctionBegin; 537 PetscCall(VecGetArrayRead(xx,&x)); 538 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 540 for (i=0; i<totalslices; i++) { /* loop over slices */ 541 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 542 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 543 544 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 545 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 546 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 547 } else { 548 vec_y = _mm512_loadu_pd(&y[8*i]); 549 } 550 vec_y2 = _mm512_setzero_pd(); 551 vec_y3 = _mm512_setzero_pd(); 552 vec_y4 = _mm512_setzero_pd(); 553 554 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 555 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 556 case 3: 557 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 558 acolidx += 8; aval += 8; 559 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 560 acolidx += 8; aval += 8; 561 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 562 acolidx += 8; aval += 8; 563 j += 3; 564 break; 565 case 2: 566 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 567 acolidx += 8; aval += 8; 568 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 569 acolidx += 8; aval += 8; 570 j += 2; 571 break; 572 case 1: 573 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 574 acolidx += 8; aval += 8; 575 j += 1; 576 break; 577 } 578 #pragma novector 579 for (; j<(a->sliidx[i+1]>>3); j+=4) { 580 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 581 acolidx += 8; aval += 8; 582 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 583 acolidx += 8; aval += 8; 584 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 585 acolidx += 8; aval += 8; 586 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 587 acolidx += 8; aval += 8; 588 } 589 590 vec_y = _mm512_add_pd(vec_y,vec_y2); 591 vec_y = _mm512_add_pd(vec_y,vec_y3); 592 vec_y = _mm512_add_pd(vec_y,vec_y4); 593 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 594 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 595 } else { 596 _mm512_storeu_pd(&z[8*i],vec_y); 597 } 598 } 599 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 600 for (i=0; i<totalslices; i++) { /* loop over full slices */ 601 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 602 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 603 604 /* last slice may have padding rows. Don't use vectorization. */ 605 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 606 for (r=0; r<(A->rmap->n & 0x07); ++r) { 607 row = 8*i + r; 608 yval = (MatScalar)0.0; 609 nnz_in_row = a->rlen[row]; 610 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 611 z[row] = y[row] + yval; 612 } 613 break; 614 } 615 616 vec_y = _mm256_loadu_pd(y+8*i); 617 vec_y2 = _mm256_loadu_pd(y+8*i+4); 618 619 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 620 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 621 vec_vals = _mm256_loadu_pd(aval); 622 vec_x_tmp = _mm_setzero_pd(); 623 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 624 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 625 vec_x = _mm256_setzero_pd(); 626 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 627 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 628 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 629 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 630 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 631 aval += 4; 632 633 vec_vals = _mm256_loadu_pd(aval); 634 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 635 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 636 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 637 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 638 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 639 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 640 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 641 aval += 4; 642 } 643 644 _mm256_storeu_pd(z+i*8,vec_y); 645 _mm256_storeu_pd(z+i*8+4,vec_y2); 646 } 647 #else 648 for (i=0; i<totalslices; i++) { /* loop over slices */ 649 for (j=0; j<8; j++) sum[j] = 0.0; 650 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 651 sum[0] += aval[j] * x[acolidx[j]]; 652 sum[1] += aval[j+1] * x[acolidx[j+1]]; 653 sum[2] += aval[j+2] * x[acolidx[j+2]]; 654 sum[3] += aval[j+3] * x[acolidx[j+3]]; 655 sum[4] += aval[j+4] * x[acolidx[j+4]]; 656 sum[5] += aval[j+5] * x[acolidx[j+5]]; 657 sum[6] += aval[j+6] * x[acolidx[j+6]]; 658 sum[7] += aval[j+7] * x[acolidx[j+7]]; 659 } 660 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 661 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 662 } else { 663 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 664 } 665 } 666 #endif 667 668 PetscCall(PetscLogFlops(2.0*a->nz)); 669 PetscCall(VecRestoreArrayRead(xx,&x)); 670 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 671 PetscFunctionReturn(0); 672 } 673 674 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 675 { 676 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 677 PetscScalar *y; 678 const PetscScalar *x; 679 const MatScalar *aval=a->val; 680 const PetscInt *acolidx=a->colidx; 681 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 682 683 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 684 #pragma disjoint(*x,*y,*aval) 685 #endif 686 687 PetscFunctionBegin; 688 if (A->symmetric) { 689 PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy)); 690 PetscFunctionReturn(0); 691 } 692 if (zz != yy) PetscCall(VecCopy(zz,yy)); 693 PetscCall(VecGetArrayRead(xx,&x)); 694 PetscCall(VecGetArray(yy,&y)); 695 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 696 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 697 for (r=0; r<(A->rmap->n & 0x07); ++r) { 698 row = 8*i + r; 699 nnz_in_row = a->rlen[row]; 700 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 701 } 702 break; 703 } 704 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 705 y[acolidx[j]] += aval[j] * x[8*i]; 706 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 707 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 708 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 709 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 710 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 711 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 712 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 713 } 714 } 715 PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 716 PetscCall(VecRestoreArrayRead(xx,&x)); 717 PetscCall(VecRestoreArray(yy,&y)); 718 PetscFunctionReturn(0); 719 } 720 721 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 722 { 723 PetscFunctionBegin; 724 if (A->symmetric) { 725 PetscCall(MatMult_SeqSELL(A,xx,yy)); 726 } else { 727 PetscCall(VecSet(yy,0.0)); 728 PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 729 } 730 PetscFunctionReturn(0); 731 } 732 733 /* 734 Checks for missing diagonals 735 */ 736 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 737 { 738 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 739 PetscInt *diag,i; 740 741 PetscFunctionBegin; 742 *missing = PETSC_FALSE; 743 if (A->rmap->n > 0 && !(a->colidx)) { 744 *missing = PETSC_TRUE; 745 if (d) *d = 0; 746 PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 747 } else { 748 diag = a->diag; 749 for (i=0; i<A->rmap->n; i++) { 750 if (diag[i] == -1) { 751 *missing = PETSC_TRUE; 752 if (d) *d = i; 753 PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 754 break; 755 } 756 } 757 } 758 PetscFunctionReturn(0); 759 } 760 761 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 762 { 763 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 764 PetscInt i,j,m=A->rmap->n,shift; 765 766 PetscFunctionBegin; 767 if (!a->diag) { 768 PetscCall(PetscMalloc1(m,&a->diag)); 769 PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 770 a->free_diag = PETSC_TRUE; 771 } 772 for (i=0; i<m; i++) { /* loop over rows */ 773 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 774 a->diag[i] = -1; 775 for (j=0; j<a->rlen[i]; j++) { 776 if (a->colidx[shift+j*8] == i) { 777 a->diag[i] = shift+j*8; 778 break; 779 } 780 } 781 } 782 PetscFunctionReturn(0); 783 } 784 785 /* 786 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 787 */ 788 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 789 { 790 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 791 PetscInt i,*diag,m = A->rmap->n; 792 MatScalar *val = a->val; 793 PetscScalar *idiag,*mdiag; 794 795 PetscFunctionBegin; 796 if (a->idiagvalid) PetscFunctionReturn(0); 797 PetscCall(MatMarkDiagonal_SeqSELL(A)); 798 diag = a->diag; 799 if (!a->idiag) { 800 PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 801 PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 802 val = a->val; 803 } 804 mdiag = a->mdiag; 805 idiag = a->idiag; 806 807 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 808 for (i=0; i<m; i++) { 809 mdiag[i] = val[diag[i]]; 810 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 811 if (PetscRealPart(fshift)) { 812 PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 813 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 814 A->factorerror_zeropivot_value = 0.0; 815 A->factorerror_zeropivot_row = i; 816 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 817 } 818 idiag[i] = 1.0/val[diag[i]]; 819 } 820 PetscCall(PetscLogFlops(m)); 821 } else { 822 for (i=0; i<m; i++) { 823 mdiag[i] = val[diag[i]]; 824 idiag[i] = omega/(fshift + val[diag[i]]); 825 } 826 PetscCall(PetscLogFlops(2.0*m)); 827 } 828 a->idiagvalid = PETSC_TRUE; 829 PetscFunctionReturn(0); 830 } 831 832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 833 { 834 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 835 836 PetscFunctionBegin; 837 PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 838 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 839 PetscFunctionReturn(0); 840 } 841 842 PetscErrorCode MatDestroy_SeqSELL(Mat A) 843 { 844 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 845 846 PetscFunctionBegin; 847 #if defined(PETSC_USE_LOG) 848 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 849 #endif 850 PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 851 PetscCall(ISDestroy(&a->row)); 852 PetscCall(ISDestroy(&a->col)); 853 PetscCall(PetscFree(a->diag)); 854 PetscCall(PetscFree(a->rlen)); 855 PetscCall(PetscFree(a->sliidx)); 856 PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 857 PetscCall(PetscFree(a->solve_work)); 858 PetscCall(ISDestroy(&a->icol)); 859 PetscCall(PetscFree(a->saved_values)); 860 PetscCall(PetscFree2(a->getrowcols,a->getrowvals)); 861 862 PetscCall(PetscFree(A->data)); 863 864 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 865 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 866 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 867 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 868 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLGetArray_C",NULL)); 869 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLRestoreArray_C",NULL)); 870 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqsell_seqaij_C",NULL)); 871 PetscFunctionReturn(0); 872 } 873 874 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 875 { 876 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 877 878 PetscFunctionBegin; 879 switch (op) { 880 case MAT_ROW_ORIENTED: 881 a->roworiented = flg; 882 break; 883 case MAT_KEEP_NONZERO_PATTERN: 884 a->keepnonzeropattern = flg; 885 break; 886 case MAT_NEW_NONZERO_LOCATIONS: 887 a->nonew = (flg ? 0 : 1); 888 break; 889 case MAT_NEW_NONZERO_LOCATION_ERR: 890 a->nonew = (flg ? -1 : 0); 891 break; 892 case MAT_NEW_NONZERO_ALLOCATION_ERR: 893 a->nonew = (flg ? -2 : 0); 894 break; 895 case MAT_UNUSED_NONZERO_LOCATION_ERR: 896 a->nounused = (flg ? -1 : 0); 897 break; 898 case MAT_FORCE_DIAGONAL_ENTRIES: 899 case MAT_IGNORE_OFF_PROC_ENTRIES: 900 case MAT_USE_HASH_TABLE: 901 case MAT_SORTED_FULL: 902 PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 903 break; 904 case MAT_SPD: 905 case MAT_SYMMETRIC: 906 case MAT_STRUCTURALLY_SYMMETRIC: 907 case MAT_HERMITIAN: 908 case MAT_SYMMETRY_ETERNAL: 909 /* These options are handled directly by MatSetOption() */ 910 break; 911 default: 912 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 913 } 914 PetscFunctionReturn(0); 915 } 916 917 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 918 { 919 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 920 PetscInt i,j,n,shift; 921 PetscScalar *x,zero=0.0; 922 923 PetscFunctionBegin; 924 PetscCall(VecGetLocalSize(v,&n)); 925 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 926 927 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 928 PetscInt *diag=a->diag; 929 PetscCall(VecGetArray(v,&x)); 930 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 931 PetscCall(VecRestoreArray(v,&x)); 932 PetscFunctionReturn(0); 933 } 934 935 PetscCall(VecSet(v,zero)); 936 PetscCall(VecGetArray(v,&x)); 937 for (i=0; i<n; i++) { /* loop over rows */ 938 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 939 x[i] = 0; 940 for (j=0; j<a->rlen[i]; j++) { 941 if (a->colidx[shift+j*8] == i) { 942 x[i] = a->val[shift+j*8]; 943 break; 944 } 945 } 946 } 947 PetscCall(VecRestoreArray(v,&x)); 948 PetscFunctionReturn(0); 949 } 950 951 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 952 { 953 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 954 const PetscScalar *l,*r; 955 PetscInt i,j,m,n,row; 956 957 PetscFunctionBegin; 958 if (ll) { 959 /* The local size is used so that VecMPI can be passed to this routine 960 by MatDiagonalScale_MPISELL */ 961 PetscCall(VecGetLocalSize(ll,&m)); 962 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 963 PetscCall(VecGetArrayRead(ll,&l)); 964 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 965 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 966 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 967 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 968 } 969 } else { 970 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 971 a->val[j] *= l[8*i+row]; 972 } 973 } 974 } 975 PetscCall(VecRestoreArrayRead(ll,&l)); 976 PetscCall(PetscLogFlops(a->nz)); 977 } 978 if (rr) { 979 PetscCall(VecGetLocalSize(rr,&n)); 980 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 981 PetscCall(VecGetArrayRead(rr,&r)); 982 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 983 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 984 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 985 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 986 } 987 } else { 988 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 989 a->val[j] *= r[a->colidx[j]]; 990 } 991 } 992 } 993 PetscCall(VecRestoreArrayRead(rr,&r)); 994 PetscCall(PetscLogFlops(a->nz)); 995 } 996 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 997 PetscFunctionReturn(0); 998 } 999 1000 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 1001 { 1002 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1003 PetscInt *cp,i,k,low,high,t,row,col,l; 1004 PetscInt shift; 1005 MatScalar *vp; 1006 1007 PetscFunctionBegin; 1008 for (k=0; k<m; k++) { /* loop over requested rows */ 1009 row = im[k]; 1010 if (row<0) continue; 1011 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1012 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1013 cp = a->colidx+shift; /* pointer to the row */ 1014 vp = a->val+shift; /* pointer to the row */ 1015 for (l=0; l<n; l++) { /* loop over requested columns */ 1016 col = in[l]; 1017 if (col<0) continue; 1018 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1019 high = a->rlen[row]; low = 0; /* assume unsorted */ 1020 while (high-low > 5) { 1021 t = (low+high)/2; 1022 if (*(cp+t*8) > col) high = t; 1023 else low = t; 1024 } 1025 for (i=low; i<high; i++) { 1026 if (*(cp+8*i) > col) break; 1027 if (*(cp+8*i) == col) { 1028 *v++ = *(vp+8*i); 1029 goto finished; 1030 } 1031 } 1032 *v++ = 0.0; 1033 finished:; 1034 } 1035 } 1036 PetscFunctionReturn(0); 1037 } 1038 1039 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1040 { 1041 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1042 PetscInt i,j,m=A->rmap->n,shift; 1043 const char *name; 1044 PetscViewerFormat format; 1045 1046 PetscFunctionBegin; 1047 PetscCall(PetscViewerGetFormat(viewer,&format)); 1048 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1049 PetscInt nofinalvalue = 0; 1050 /* 1051 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1052 nofinalvalue = 1; 1053 } 1054 */ 1055 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1056 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1057 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1058 #if defined(PETSC_USE_COMPLEX) 1059 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1060 #else 1061 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1062 #endif 1063 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1064 1065 for (i=0; i<m; i++) { 1066 shift = a->sliidx[i>>3]+(i&0x07); 1067 for (j=0; j<a->rlen[i]; j++) { 1068 #if defined(PETSC_USE_COMPLEX) 1069 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1070 #else 1071 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1072 #endif 1073 } 1074 } 1075 /* 1076 if (nofinalvalue) { 1077 #if defined(PETSC_USE_COMPLEX) 1078 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1079 #else 1080 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1081 #endif 1082 } 1083 */ 1084 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1085 PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1086 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1087 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1088 PetscFunctionReturn(0); 1089 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1090 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1091 for (i=0; i<m; i++) { 1092 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1093 shift = a->sliidx[i>>3]+(i&0x07); 1094 for (j=0; j<a->rlen[i]; j++) { 1095 #if defined(PETSC_USE_COMPLEX) 1096 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1097 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1098 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1099 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1100 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1101 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1102 } 1103 #else 1104 if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1105 #endif 1106 } 1107 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1108 } 1109 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1110 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1111 PetscInt cnt=0,jcnt; 1112 PetscScalar value; 1113 #if defined(PETSC_USE_COMPLEX) 1114 PetscBool realonly=PETSC_TRUE; 1115 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1116 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1117 realonly = PETSC_FALSE; 1118 break; 1119 } 1120 } 1121 #endif 1122 1123 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1124 for (i=0; i<m; i++) { 1125 jcnt = 0; 1126 shift = a->sliidx[i>>3]+(i&0x07); 1127 for (j=0; j<A->cmap->n; j++) { 1128 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1129 value = a->val[cnt++]; 1130 jcnt++; 1131 } else { 1132 value = 0.0; 1133 } 1134 #if defined(PETSC_USE_COMPLEX) 1135 if (realonly) { 1136 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1137 } else { 1138 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1139 } 1140 #else 1141 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1142 #endif 1143 } 1144 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1145 } 1146 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1147 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1148 PetscInt fshift=1; 1149 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1150 #if defined(PETSC_USE_COMPLEX) 1151 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1152 #else 1153 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1154 #endif 1155 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1156 for (i=0; i<m; i++) { 1157 shift = a->sliidx[i>>3]+(i&0x07); 1158 for (j=0; j<a->rlen[i]; j++) { 1159 #if defined(PETSC_USE_COMPLEX) 1160 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1161 #else 1162 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1163 #endif 1164 } 1165 } 1166 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1167 } else if (format == PETSC_VIEWER_NATIVE) { 1168 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1169 PetscInt row; 1170 PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1171 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1172 #if defined(PETSC_USE_COMPLEX) 1173 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1174 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1175 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1176 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1177 } else { 1178 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1179 } 1180 #else 1181 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1182 #endif 1183 } 1184 } 1185 } else { 1186 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1187 if (A->factortype) { 1188 for (i=0; i<m; i++) { 1189 shift = a->sliidx[i>>3]+(i&0x07); 1190 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1191 /* L part */ 1192 for (j=shift; j<a->diag[i]; j+=8) { 1193 #if defined(PETSC_USE_COMPLEX) 1194 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1195 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1196 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1197 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1198 } else { 1199 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1200 } 1201 #else 1202 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1203 #endif 1204 } 1205 /* diagonal */ 1206 j = a->diag[i]; 1207 #if defined(PETSC_USE_COMPLEX) 1208 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1209 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1210 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1211 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1212 } else { 1213 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1214 } 1215 #else 1216 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1217 #endif 1218 1219 /* U part */ 1220 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1221 #if defined(PETSC_USE_COMPLEX) 1222 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1223 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1224 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1225 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1226 } else { 1227 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1228 } 1229 #else 1230 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1231 #endif 1232 } 1233 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1234 } 1235 } else { 1236 for (i=0; i<m; i++) { 1237 shift = a->sliidx[i>>3]+(i&0x07); 1238 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1239 for (j=0; j<a->rlen[i]; j++) { 1240 #if defined(PETSC_USE_COMPLEX) 1241 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1242 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1243 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1244 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1245 } else { 1246 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1247 } 1248 #else 1249 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1250 #endif 1251 } 1252 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1253 } 1254 } 1255 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1256 } 1257 PetscCall(PetscViewerFlush(viewer)); 1258 PetscFunctionReturn(0); 1259 } 1260 1261 #include <petscdraw.h> 1262 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1263 { 1264 Mat A=(Mat)Aa; 1265 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1266 PetscInt i,j,m=A->rmap->n,shift; 1267 int color; 1268 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1269 PetscViewer viewer; 1270 PetscViewerFormat format; 1271 1272 PetscFunctionBegin; 1273 PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1274 PetscCall(PetscViewerGetFormat(viewer,&format)); 1275 PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1276 1277 /* loop over matrix elements drawing boxes */ 1278 1279 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1280 PetscDrawCollectiveBegin(draw); 1281 /* Blue for negative, Cyan for zero and Red for positive */ 1282 color = PETSC_DRAW_BLUE; 1283 for (i=0; i<m; i++) { 1284 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1285 y_l = m - i - 1.0; y_r = y_l + 1.0; 1286 for (j=0; j<a->rlen[i]; j++) { 1287 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1288 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1289 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1290 } 1291 } 1292 color = PETSC_DRAW_CYAN; 1293 for (i=0; i<m; i++) { 1294 shift = a->sliidx[i>>3]+(i&0x07); 1295 y_l = m - i - 1.0; y_r = y_l + 1.0; 1296 for (j=0; j<a->rlen[i]; j++) { 1297 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1298 if (a->val[shift+8*j] != 0.) continue; 1299 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1300 } 1301 } 1302 color = PETSC_DRAW_RED; 1303 for (i=0; i<m; i++) { 1304 shift = a->sliidx[i>>3]+(i&0x07); 1305 y_l = m - i - 1.0; y_r = y_l + 1.0; 1306 for (j=0; j<a->rlen[i]; j++) { 1307 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1308 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1309 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1310 } 1311 } 1312 PetscDrawCollectiveEnd(draw); 1313 } else { 1314 /* use contour shading to indicate magnitude of values */ 1315 /* first determine max of all nonzero values */ 1316 PetscReal minv=0.0,maxv=0.0; 1317 PetscInt count=0; 1318 PetscDraw popup; 1319 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1320 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1321 } 1322 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1323 PetscCall(PetscDrawGetPopup(draw,&popup)); 1324 PetscCall(PetscDrawScalePopup(popup,minv,maxv)); 1325 1326 PetscDrawCollectiveBegin(draw); 1327 for (i=0; i<m; i++) { 1328 shift = a->sliidx[i>>3]+(i&0x07); 1329 y_l = m - i - 1.0; 1330 y_r = y_l + 1.0; 1331 for (j=0; j<a->rlen[i]; j++) { 1332 x_l = a->colidx[shift+j*8]; 1333 x_r = x_l + 1.0; 1334 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1335 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1336 count++; 1337 } 1338 } 1339 PetscDrawCollectiveEnd(draw); 1340 } 1341 PetscFunctionReturn(0); 1342 } 1343 1344 #include <petscdraw.h> 1345 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1346 { 1347 PetscDraw draw; 1348 PetscReal xr,yr,xl,yl,h,w; 1349 PetscBool isnull; 1350 1351 PetscFunctionBegin; 1352 PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw)); 1353 PetscCall(PetscDrawIsNull(draw,&isnull)); 1354 if (isnull) PetscFunctionReturn(0); 1355 1356 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1357 xr += w; yr += h; xl = -w; yl = -h; 1358 PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1359 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1360 PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1361 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1362 PetscCall(PetscDrawSave(draw)); 1363 PetscFunctionReturn(0); 1364 } 1365 1366 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1367 { 1368 PetscBool iascii,isbinary,isdraw; 1369 1370 PetscFunctionBegin; 1371 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1372 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1373 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1374 if (iascii) { 1375 PetscCall(MatView_SeqSELL_ASCII(A,viewer)); 1376 } else if (isbinary) { 1377 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1378 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A,viewer)); 1379 PetscFunctionReturn(0); 1380 } 1381 1382 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1383 { 1384 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1385 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1386 MatScalar *vp; 1387 1388 PetscFunctionBegin; 1389 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1390 /* To do: compress out the unused elements */ 1391 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1392 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1393 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1394 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1395 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1396 for (i=0; i<a->totalslices; ++i) { 1397 shift = a->sliidx[i]; /* starting index of the slice */ 1398 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1399 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1400 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1401 row = 8*i + row_in_slice; 1402 nrow = a->rlen[row]; /* number of nonzeros in row */ 1403 /* 1404 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1405 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1406 */ 1407 lastcol = 0; 1408 if (nrow>0) { /* nonempty row */ 1409 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1410 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1411 for (j=1;j<8;j++) { 1412 if (a->rlen[8*i+j]) { 1413 lastcol = cp[j]; 1414 break; 1415 } 1416 } 1417 } else { 1418 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1419 } 1420 1421 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1422 cp[8*k+row_in_slice] = lastcol; 1423 vp[8*k+row_in_slice] = (MatScalar)0; 1424 } 1425 } 1426 } 1427 1428 A->info.mallocs += a->reallocs; 1429 a->reallocs = 0; 1430 1431 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1432 PetscFunctionReturn(0); 1433 } 1434 1435 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1436 { 1437 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1438 1439 PetscFunctionBegin; 1440 info->block_size = 1.0; 1441 info->nz_allocated = a->maxallocmat; 1442 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1443 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1444 info->assemblies = A->num_ass; 1445 info->mallocs = A->info.mallocs; 1446 info->memory = ((PetscObject)A)->mem; 1447 if (A->factortype) { 1448 info->fill_ratio_given = A->info.fill_ratio_given; 1449 info->fill_ratio_needed = A->info.fill_ratio_needed; 1450 info->factor_mallocs = A->info.factor_mallocs; 1451 } else { 1452 info->fill_ratio_given = 0; 1453 info->fill_ratio_needed = 0; 1454 info->factor_mallocs = 0; 1455 } 1456 PetscFunctionReturn(0); 1457 } 1458 1459 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1460 { 1461 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1462 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1463 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1464 MatScalar *vp,value; 1465 1466 PetscFunctionBegin; 1467 for (k=0; k<m; k++) { /* loop over added rows */ 1468 row = im[k]; 1469 if (row < 0) continue; 1470 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1471 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1472 cp = a->colidx+shift; /* pointer to the row */ 1473 vp = a->val+shift; /* pointer to the row */ 1474 nrow = a->rlen[row]; 1475 low = 0; 1476 high = nrow; 1477 1478 for (l=0; l<n; l++) { /* loop over added columns */ 1479 col = in[l]; 1480 if (col<0) continue; 1481 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1482 if (a->roworiented) { 1483 value = v[l+k*n]; 1484 } else { 1485 value = v[k+l*m]; 1486 } 1487 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1488 1489 /* search in this row for the specified column, i indicates the column to be set */ 1490 if (col <= lastcol) low = 0; 1491 else high = nrow; 1492 lastcol = col; 1493 while (high-low > 5) { 1494 t = (low+high)/2; 1495 if (*(cp+t*8) > col) high = t; 1496 else low = t; 1497 } 1498 for (i=low; i<high; i++) { 1499 if (*(cp+i*8) > col) break; 1500 if (*(cp+i*8) == col) { 1501 if (is == ADD_VALUES) *(vp+i*8) += value; 1502 else *(vp+i*8) = value; 1503 low = i + 1; 1504 goto noinsert; 1505 } 1506 } 1507 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1508 if (nonew == 1) goto noinsert; 1509 PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1510 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1511 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1512 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1513 for (ii=nrow-1; ii>=i; ii--) { 1514 *(cp+(ii+1)*8) = *(cp+ii*8); 1515 *(vp+(ii+1)*8) = *(vp+ii*8); 1516 } 1517 a->rlen[row]++; 1518 *(cp+i*8) = col; 1519 *(vp+i*8) = value; 1520 a->nz++; 1521 A->nonzerostate++; 1522 low = i+1; high++; nrow++; 1523 noinsert:; 1524 } 1525 a->rlen[row] = nrow; 1526 } 1527 PetscFunctionReturn(0); 1528 } 1529 1530 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1531 { 1532 PetscFunctionBegin; 1533 /* If the two matrices have the same copy implementation, use fast copy. */ 1534 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1535 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1536 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1537 1538 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1539 PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1540 } else { 1541 PetscCall(MatCopy_Basic(A,B,str)); 1542 } 1543 PetscFunctionReturn(0); 1544 } 1545 1546 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1547 { 1548 PetscFunctionBegin; 1549 PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1550 PetscFunctionReturn(0); 1551 } 1552 1553 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1554 { 1555 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1556 1557 PetscFunctionBegin; 1558 *array = a->val; 1559 PetscFunctionReturn(0); 1560 } 1561 1562 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1563 { 1564 PetscFunctionBegin; 1565 PetscFunctionReturn(0); 1566 } 1567 1568 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1569 { 1570 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1571 PetscInt i; 1572 MatScalar *aval=a->val; 1573 1574 PetscFunctionBegin; 1575 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1576 PetscFunctionReturn(0); 1577 } 1578 1579 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1580 { 1581 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1582 PetscInt i; 1583 MatScalar *aval=a->val; 1584 1585 PetscFunctionBegin; 1586 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1587 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1588 PetscFunctionReturn(0); 1589 } 1590 1591 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1592 { 1593 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1594 MatScalar *aval=a->val; 1595 PetscScalar oalpha=alpha; 1596 PetscBLASInt one=1,size; 1597 1598 PetscFunctionBegin; 1599 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size)); 1600 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1601 PetscCall(PetscLogFlops(a->nz)); 1602 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1607 { 1608 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1609 1610 PetscFunctionBegin; 1611 if (!Y->preallocated || !y->nz) { 1612 PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL)); 1613 } 1614 PetscCall(MatShift_Basic(Y,a)); 1615 PetscFunctionReturn(0); 1616 } 1617 1618 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1619 { 1620 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1621 PetscScalar *x,sum,*t; 1622 const MatScalar *idiag=NULL,*mdiag; 1623 const PetscScalar *b,*xb; 1624 PetscInt n,m=A->rmap->n,i,j,shift; 1625 const PetscInt *diag; 1626 1627 PetscFunctionBegin; 1628 its = its*lits; 1629 1630 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1631 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift)); 1632 a->fshift = fshift; 1633 a->omega = omega; 1634 1635 diag = a->diag; 1636 t = a->ssor_work; 1637 idiag = a->idiag; 1638 mdiag = a->mdiag; 1639 1640 PetscCall(VecGetArray(xx,&x)); 1641 PetscCall(VecGetArrayRead(bb,&b)); 1642 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1643 PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1644 PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1645 PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1646 1647 if (flag & SOR_ZERO_INITIAL_GUESS) { 1648 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1649 for (i=0; i<m; i++) { 1650 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1651 sum = b[i]; 1652 n = (diag[i]-shift)/8; 1653 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1654 t[i] = sum; 1655 x[i] = sum*idiag[i]; 1656 } 1657 xb = t; 1658 PetscCall(PetscLogFlops(a->nz)); 1659 } else xb = b; 1660 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1661 for (i=m-1; i>=0; i--) { 1662 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1663 sum = xb[i]; 1664 n = a->rlen[i]-(diag[i]-shift)/8-1; 1665 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1666 if (xb == b) { 1667 x[i] = sum*idiag[i]; 1668 } else { 1669 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1670 } 1671 } 1672 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1673 } 1674 its--; 1675 } 1676 while (its--) { 1677 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1678 for (i=0; i<m; i++) { 1679 /* lower */ 1680 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1681 sum = b[i]; 1682 n = (diag[i]-shift)/8; 1683 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1684 t[i] = sum; /* save application of the lower-triangular part */ 1685 /* upper */ 1686 n = a->rlen[i]-(diag[i]-shift)/8-1; 1687 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1688 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1689 } 1690 xb = t; 1691 PetscCall(PetscLogFlops(2.0*a->nz)); 1692 } else xb = b; 1693 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1694 for (i=m-1; i>=0; i--) { 1695 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1696 sum = xb[i]; 1697 if (xb == b) { 1698 /* whole matrix (no checkpointing available) */ 1699 n = a->rlen[i]; 1700 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1701 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1702 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1703 n = a->rlen[i]-(diag[i]-shift)/8-1; 1704 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1705 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1706 } 1707 } 1708 if (xb == b) { 1709 PetscCall(PetscLogFlops(2.0*a->nz)); 1710 } else { 1711 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1712 } 1713 } 1714 } 1715 PetscCall(VecRestoreArray(xx,&x)); 1716 PetscCall(VecRestoreArrayRead(bb,&b)); 1717 PetscFunctionReturn(0); 1718 } 1719 1720 /* -------------------------------------------------------------------*/ 1721 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1722 MatGetRow_SeqSELL, 1723 MatRestoreRow_SeqSELL, 1724 MatMult_SeqSELL, 1725 /* 4*/ MatMultAdd_SeqSELL, 1726 MatMultTranspose_SeqSELL, 1727 MatMultTransposeAdd_SeqSELL, 1728 NULL, 1729 NULL, 1730 NULL, 1731 /* 10*/ NULL, 1732 NULL, 1733 NULL, 1734 MatSOR_SeqSELL, 1735 NULL, 1736 /* 15*/ MatGetInfo_SeqSELL, 1737 MatEqual_SeqSELL, 1738 MatGetDiagonal_SeqSELL, 1739 MatDiagonalScale_SeqSELL, 1740 NULL, 1741 /* 20*/ NULL, 1742 MatAssemblyEnd_SeqSELL, 1743 MatSetOption_SeqSELL, 1744 MatZeroEntries_SeqSELL, 1745 /* 24*/ NULL, 1746 NULL, 1747 NULL, 1748 NULL, 1749 NULL, 1750 /* 29*/ MatSetUp_SeqSELL, 1751 NULL, 1752 NULL, 1753 NULL, 1754 NULL, 1755 /* 34*/ MatDuplicate_SeqSELL, 1756 NULL, 1757 NULL, 1758 NULL, 1759 NULL, 1760 /* 39*/ NULL, 1761 NULL, 1762 NULL, 1763 MatGetValues_SeqSELL, 1764 MatCopy_SeqSELL, 1765 /* 44*/ NULL, 1766 MatScale_SeqSELL, 1767 MatShift_SeqSELL, 1768 NULL, 1769 NULL, 1770 /* 49*/ NULL, 1771 NULL, 1772 NULL, 1773 NULL, 1774 NULL, 1775 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1776 NULL, 1777 NULL, 1778 NULL, 1779 NULL, 1780 /* 59*/ NULL, 1781 MatDestroy_SeqSELL, 1782 MatView_SeqSELL, 1783 NULL, 1784 NULL, 1785 /* 64*/ NULL, 1786 NULL, 1787 NULL, 1788 NULL, 1789 NULL, 1790 /* 69*/ NULL, 1791 NULL, 1792 NULL, 1793 NULL, 1794 NULL, 1795 /* 74*/ NULL, 1796 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1797 NULL, 1798 NULL, 1799 NULL, 1800 /* 79*/ NULL, 1801 NULL, 1802 NULL, 1803 NULL, 1804 NULL, 1805 /* 84*/ NULL, 1806 NULL, 1807 NULL, 1808 NULL, 1809 NULL, 1810 /* 89*/ NULL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 NULL, 1815 /* 94*/ NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 NULL, 1820 /* 99*/ NULL, 1821 NULL, 1822 NULL, 1823 MatConjugate_SeqSELL, 1824 NULL, 1825 /*104*/ NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 /*109*/ NULL, 1831 NULL, 1832 NULL, 1833 NULL, 1834 MatMissingDiagonal_SeqSELL, 1835 /*114*/ NULL, 1836 NULL, 1837 NULL, 1838 NULL, 1839 NULL, 1840 /*119*/ NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 NULL, 1845 /*124*/ NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 /*129*/ NULL, 1851 NULL, 1852 NULL, 1853 NULL, 1854 NULL, 1855 /*134*/ NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 /*139*/ NULL, 1861 NULL, 1862 NULL, 1863 MatFDColoringSetUp_SeqXAIJ, 1864 NULL, 1865 /*144*/ NULL, 1866 NULL, 1867 NULL, 1868 NULL, 1869 NULL, 1870 NULL 1871 }; 1872 1873 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1874 { 1875 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1876 1877 PetscFunctionBegin; 1878 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1879 1880 /* allocate space for values if not already there */ 1881 if (!a->saved_values) { 1882 PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values)); 1883 PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar))); 1884 } 1885 1886 /* copy values over */ 1887 PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices])); 1888 PetscFunctionReturn(0); 1889 } 1890 1891 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1892 { 1893 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1894 1895 PetscFunctionBegin; 1896 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1897 PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1898 PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices])); 1899 PetscFunctionReturn(0); 1900 } 1901 1902 /*@C 1903 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1904 1905 Not Collective 1906 1907 Input Parameters: 1908 . mat - a MATSEQSELL matrix 1909 . array - pointer to the data 1910 1911 Level: intermediate 1912 1913 .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1914 @*/ 1915 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1916 { 1917 PetscFunctionBegin; 1918 PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array)); 1919 PetscFunctionReturn(0); 1920 } 1921 1922 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1923 { 1924 Mat_SeqSELL *b; 1925 PetscMPIInt size; 1926 1927 PetscFunctionBegin; 1928 PetscCall(PetscCitationsRegister(citation,&cited)); 1929 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 1930 PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1931 1932 PetscCall(PetscNewLog(B,&b)); 1933 1934 B->data = (void*)b; 1935 1936 PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps))); 1937 1938 b->row = NULL; 1939 b->col = NULL; 1940 b->icol = NULL; 1941 b->reallocs = 0; 1942 b->ignorezeroentries = PETSC_FALSE; 1943 b->roworiented = PETSC_TRUE; 1944 b->nonew = 0; 1945 b->diag = NULL; 1946 b->solve_work = NULL; 1947 B->spptr = NULL; 1948 b->saved_values = NULL; 1949 b->idiag = NULL; 1950 b->mdiag = NULL; 1951 b->ssor_work = NULL; 1952 b->omega = 1.0; 1953 b->fshift = 0.0; 1954 b->idiagvalid = PETSC_FALSE; 1955 b->keepnonzeropattern = PETSC_FALSE; 1956 1957 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL)); 1958 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL)); 1959 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL)); 1960 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL)); 1961 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL)); 1962 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL)); 1963 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ)); 1964 PetscFunctionReturn(0); 1965 } 1966 1967 /* 1968 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1969 */ 1970 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1971 { 1972 Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data; 1973 PetscInt i,m=A->rmap->n; 1974 PetscInt totalslices=a->totalslices; 1975 1976 PetscFunctionBegin; 1977 C->factortype = A->factortype; 1978 c->row = NULL; 1979 c->col = NULL; 1980 c->icol = NULL; 1981 c->reallocs = 0; 1982 C->assembled = PETSC_TRUE; 1983 1984 PetscCall(PetscLayoutReference(A->rmap,&C->rmap)); 1985 PetscCall(PetscLayoutReference(A->cmap,&C->cmap)); 1986 1987 PetscCall(PetscMalloc1(8*totalslices,&c->rlen)); 1988 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 1989 PetscCall(PetscMalloc1(totalslices+1,&c->sliidx)); 1990 PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt))); 1991 1992 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 1993 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 1994 1995 /* allocate the matrix space */ 1996 if (mallocmatspace) { 1997 PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx)); 1998 PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)))); 1999 2000 c->singlemalloc = PETSC_TRUE; 2001 2002 if (m > 0) { 2003 PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat)); 2004 if (cpvalues == MAT_COPY_VALUES) { 2005 PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat)); 2006 } else { 2007 PetscCall(PetscArrayzero(c->val,a->maxallocmat)); 2008 } 2009 } 2010 } 2011 2012 c->ignorezeroentries = a->ignorezeroentries; 2013 c->roworiented = a->roworiented; 2014 c->nonew = a->nonew; 2015 if (a->diag) { 2016 PetscCall(PetscMalloc1(m,&c->diag)); 2017 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 2018 for (i=0; i<m; i++) { 2019 c->diag[i] = a->diag[i]; 2020 } 2021 } else c->diag = NULL; 2022 2023 c->solve_work = NULL; 2024 c->saved_values = NULL; 2025 c->idiag = NULL; 2026 c->ssor_work = NULL; 2027 c->keepnonzeropattern = a->keepnonzeropattern; 2028 c->free_val = PETSC_TRUE; 2029 c->free_colidx = PETSC_TRUE; 2030 2031 c->maxallocmat = a->maxallocmat; 2032 c->maxallocrow = a->maxallocrow; 2033 c->rlenmax = a->rlenmax; 2034 c->nz = a->nz; 2035 C->preallocated = PETSC_TRUE; 2036 2037 c->nonzerorowcnt = a->nonzerorowcnt; 2038 C->nonzerostate = A->nonzerostate; 2039 2040 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist)); 2041 PetscFunctionReturn(0); 2042 } 2043 2044 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2045 { 2046 PetscFunctionBegin; 2047 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2048 PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n)); 2049 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2050 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2051 } 2052 PetscCall(MatSetType(*B,((PetscObject)A)->type_name)); 2053 PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE)); 2054 PetscFunctionReturn(0); 2055 } 2056 2057 /*MC 2058 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2059 based on the sliced Ellpack format 2060 2061 Options Database Keys: 2062 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions() 2063 2064 Level: beginner 2065 2066 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2067 M*/ 2068 2069 /*MC 2070 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2071 2072 This matrix type is identical to MATSEQSELL when constructed with a single process communicator, 2073 and MATMPISELL otherwise. As a result, for single process communicators, 2074 MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported 2075 for communicators controlling multiple processes. It is recommended that you call both of 2076 the above preallocation routines for simplicity. 2077 2078 Options Database Keys: 2079 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2080 2081 Level: beginner 2082 2083 Notes: 2084 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2085 2086 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2087 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2088 2089 Developer Notes: 2090 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2091 2092 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2093 .vb 2094 (2 0 3 4) 2095 Consider the matrix A = (5 0 6 0) 2096 (0 0 7 8) 2097 (0 0 9 9) 2098 2099 symbolically the Ellpack format can be written as 2100 2101 (2 3 4 |) (0 2 3 |) 2102 v = (5 6 0 |) colidx = (0 2 2 |) 2103 -------- --------- 2104 (7 8 |) (2 3 |) 2105 (9 9 |) (2 3 |) 2106 2107 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2108 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2109 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2110 2111 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2112 2113 .ve 2114 2115 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2116 2117 References: 2118 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2119 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2120 2121 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2122 M*/ 2123 2124 /*@C 2125 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2126 2127 Collective on comm 2128 2129 Input Parameters: 2130 + comm - MPI communicator, set to PETSC_COMM_SELF 2131 . m - number of rows 2132 . n - number of columns 2133 . rlenmax - maximum number of nonzeros in a row 2134 - rlen - array containing the number of nonzeros in the various rows 2135 (possibly different for each row) or NULL 2136 2137 Output Parameter: 2138 . A - the matrix 2139 2140 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2141 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2142 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2143 2144 Notes: 2145 If nnz is given then nz is ignored 2146 2147 Specify the preallocated storage with either rlenmax or rlen (not both). 2148 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2149 allocation. For large problems you MUST preallocate memory or you 2150 will get TERRIBLE performance, see the users' manual chapter on matrices. 2151 2152 Level: intermediate 2153 2154 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2155 2156 @*/ 2157 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2158 { 2159 PetscFunctionBegin; 2160 PetscCall(MatCreate(comm,A)); 2161 PetscCall(MatSetSizes(*A,m,n,m,n)); 2162 PetscCall(MatSetType(*A,MATSEQSELL)); 2163 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen)); 2164 PetscFunctionReturn(0); 2165 } 2166 2167 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2168 { 2169 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2170 PetscInt totalslices=a->totalslices; 2171 2172 PetscFunctionBegin; 2173 /* If the matrix dimensions are not equal,or no of nonzeros */ 2174 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2175 *flg = PETSC_FALSE; 2176 PetscFunctionReturn(0); 2177 } 2178 /* if the a->colidx are the same */ 2179 PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg)); 2180 if (!*flg) PetscFunctionReturn(0); 2181 /* if a->val are the same */ 2182 PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg)); 2183 PetscFunctionReturn(0); 2184 } 2185 2186 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2187 { 2188 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2189 2190 PetscFunctionBegin; 2191 a->idiagvalid = PETSC_FALSE; 2192 PetscFunctionReturn(0); 2193 } 2194 2195 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2196 { 2197 #if defined(PETSC_USE_COMPLEX) 2198 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2199 PetscInt i; 2200 PetscScalar *val = a->val; 2201 2202 PetscFunctionBegin; 2203 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2204 val[i] = PetscConj(val[i]); 2205 } 2206 #else 2207 PetscFunctionBegin; 2208 #endif 2209 PetscFunctionReturn(0); 2210 } 2211