1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 79 80 Level: intermediate 81 82 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo() 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 PetscCall(PetscLayoutSetUp(B->rmap)); 108 PetscCall(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 PetscCall(PetscMalloc1(totalslices+1,&b->sliidx)); 131 PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 162 PetscCall(PetscCalloc1(8*totalslices,&b->rlen)); 163 PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) { 179 PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 180 } 181 PetscFunctionReturn(0); 182 } 183 184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 185 { 186 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 187 PetscInt shift; 188 189 PetscFunctionBegin; 190 PetscCheckFalse(row < 0 || row >= A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 191 if (nz) *nz = a->rlen[row]; 192 shift = a->sliidx[row>>3]+(row&0x07); 193 if (!a->getrowcols) { 194 195 PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 196 } 197 if (idx) { 198 PetscInt j; 199 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 200 *idx = a->getrowcols; 201 } 202 if (v) { 203 PetscInt j; 204 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 205 *v = a->getrowvals; 206 } 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 211 { 212 PetscFunctionBegin; 213 PetscFunctionReturn(0); 214 } 215 216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 217 { 218 Mat B; 219 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (reuse == MAT_REUSE_MATRIX) { 224 B = *newmat; 225 PetscCall(MatZeroEntries(B)); 226 } else { 227 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 228 PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 229 PetscCall(MatSetType(B,MATSEQAIJ)); 230 PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen)); 231 } 232 233 for (i=0; i<A->rmap->n; i++) { 234 PetscInt nz = 0,*cols = NULL; 235 PetscScalar *vals = NULL; 236 237 PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 239 PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 240 } 241 242 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 243 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 244 B->rmap->bs = A->rmap->bs; 245 246 if (reuse == MAT_INPLACE_MATRIX) { 247 PetscCall(MatHeaderReplace(A,&B)); 248 } else { 249 *newmat = B; 250 } 251 PetscFunctionReturn(0); 252 } 253 254 #include <../src/mat/impls/aij/seq/aij.h> 255 256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 257 { 258 Mat B; 259 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 260 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 261 const PetscInt *cols; 262 const PetscScalar *vals; 263 264 PetscFunctionBegin; 265 266 if (reuse == MAT_REUSE_MATRIX) { 267 B = *newmat; 268 } else { 269 if (PetscDefined(USE_DEBUG) || !a->ilen) { 270 PetscCall(PetscMalloc1(m,&rowlengths)); 271 for (i=0; i<m; i++) { 272 rowlengths[i] = ai[i+1] - ai[i]; 273 } 274 } 275 if (PetscDefined(USE_DEBUG) && a->ilen) { 276 PetscBool eq; 277 PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 278 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 279 PetscCall(PetscFree(rowlengths)); 280 rowlengths = a->ilen; 281 } else if (a->ilen) rowlengths = a->ilen; 282 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 283 PetscCall(MatSetSizes(B,m,n,m,n)); 284 PetscCall(MatSetType(B,MATSEQSELL)); 285 PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths)); 286 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 287 } 288 289 for (row=0; row<m; row++) { 290 PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 292 PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 293 } 294 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 295 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 296 B->rmap->bs = A->rmap->bs; 297 298 if (reuse == MAT_INPLACE_MATRIX) { 299 PetscCall(MatHeaderReplace(A,&B)); 300 } else { 301 *newmat = B; 302 } 303 PetscFunctionReturn(0); 304 } 305 306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 307 { 308 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 309 PetscScalar *y; 310 const PetscScalar *x; 311 const MatScalar *aval=a->val; 312 PetscInt totalslices=a->totalslices; 313 const PetscInt *acolidx=a->colidx; 314 PetscInt i,j; 315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 316 __m512d vec_x,vec_y,vec_vals; 317 __m256i vec_idx; 318 __mmask8 mask; 319 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 320 __m256i vec_idx2,vec_idx3,vec_idx4; 321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 322 __m128i vec_idx; 323 __m256d vec_x,vec_y,vec_y2,vec_vals; 324 MatScalar yval; 325 PetscInt r,rows_left,row,nnz_in_row; 326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 327 __m128d vec_x_tmp; 328 __m256d vec_x,vec_y,vec_y2,vec_vals; 329 MatScalar yval; 330 PetscInt r,rows_left,row,nnz_in_row; 331 #else 332 PetscScalar sum[8]; 333 #endif 334 335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 336 #pragma disjoint(*x,*y,*aval) 337 #endif 338 339 PetscFunctionBegin; 340 PetscCall(VecGetArrayRead(xx,&x)); 341 PetscCall(VecGetArray(yy,&y)); 342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 343 for (i=0; i<totalslices; i++) { /* loop over slices */ 344 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 345 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 346 347 vec_y = _mm512_setzero_pd(); 348 vec_y2 = _mm512_setzero_pd(); 349 vec_y3 = _mm512_setzero_pd(); 350 vec_y4 = _mm512_setzero_pd(); 351 352 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 353 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 354 case 3: 355 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 358 acolidx += 8; aval += 8; 359 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 360 acolidx += 8; aval += 8; 361 j += 3; 362 break; 363 case 2: 364 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 365 acolidx += 8; aval += 8; 366 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 367 acolidx += 8; aval += 8; 368 j += 2; 369 break; 370 case 1: 371 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 372 acolidx += 8; aval += 8; 373 j += 1; 374 break; 375 } 376 #pragma novector 377 for (; j<(a->sliidx[i+1]>>3); j+=4) { 378 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 383 acolidx += 8; aval += 8; 384 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 385 acolidx += 8; aval += 8; 386 } 387 388 vec_y = _mm512_add_pd(vec_y,vec_y2); 389 vec_y = _mm512_add_pd(vec_y,vec_y3); 390 vec_y = _mm512_add_pd(vec_y,vec_y4); 391 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 392 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 393 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 394 } else { 395 _mm512_storeu_pd(&y[8*i],vec_y); 396 } 397 } 398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 399 for (i=0; i<totalslices; i++) { /* loop over full slices */ 400 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 401 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 402 403 /* last slice may have padding rows. Don't use vectorization. */ 404 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 405 rows_left = A->rmap->n - 8*i; 406 for (r=0; r<rows_left; ++r) { 407 yval = (MatScalar)0; 408 row = 8*i + r; 409 nnz_in_row = a->rlen[row]; 410 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 411 y[row] = yval; 412 } 413 break; 414 } 415 416 vec_y = _mm256_setzero_pd(); 417 vec_y2 = _mm256_setzero_pd(); 418 419 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 420 #pragma novector 421 #pragma unroll(2) 422 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 424 aval += 4; acolidx += 4; 425 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 426 aval += 4; acolidx += 4; 427 } 428 429 _mm256_storeu_pd(y+i*8,vec_y); 430 _mm256_storeu_pd(y+i*8+4,vec_y2); 431 } 432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 433 for (i=0; i<totalslices; i++) { /* loop over full slices */ 434 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 435 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 436 437 vec_y = _mm256_setzero_pd(); 438 vec_y2 = _mm256_setzero_pd(); 439 440 /* last slice may have padding rows. Don't use vectorization. */ 441 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 442 rows_left = A->rmap->n - 8*i; 443 for (r=0; r<rows_left; ++r) { 444 yval = (MatScalar)0; 445 row = 8*i + r; 446 nnz_in_row = a->rlen[row]; 447 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 448 y[row] = yval; 449 } 450 break; 451 } 452 453 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 454 #pragma novector 455 #pragma unroll(2) 456 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 464 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 465 aval += 4; 466 467 vec_vals = _mm256_loadu_pd(aval); 468 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 469 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 470 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 471 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 472 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 473 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 474 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 475 aval += 4; 476 } 477 478 _mm256_storeu_pd(y + i*8, vec_y); 479 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 480 } 481 #else 482 for (i=0; i<totalslices; i++) { /* loop over slices */ 483 for (j=0; j<8; j++) sum[j] = 0.0; 484 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 485 sum[0] += aval[j] * x[acolidx[j]]; 486 sum[1] += aval[j+1] * x[acolidx[j+1]]; 487 sum[2] += aval[j+2] * x[acolidx[j+2]]; 488 sum[3] += aval[j+3] * x[acolidx[j+3]]; 489 sum[4] += aval[j+4] * x[acolidx[j+4]]; 490 sum[5] += aval[j+5] * x[acolidx[j+5]]; 491 sum[6] += aval[j+6] * x[acolidx[j+6]]; 492 sum[7] += aval[j+7] * x[acolidx[j+7]]; 493 } 494 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 495 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 496 } else { 497 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 498 } 499 } 500 #endif 501 502 PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 503 PetscCall(VecRestoreArrayRead(xx,&x)); 504 PetscCall(VecRestoreArray(yy,&y)); 505 PetscFunctionReturn(0); 506 } 507 508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 509 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 510 { 511 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 512 PetscScalar *y,*z; 513 const PetscScalar *x; 514 const MatScalar *aval=a->val; 515 PetscInt totalslices=a->totalslices; 516 const PetscInt *acolidx=a->colidx; 517 PetscInt i,j; 518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 519 __m512d vec_x,vec_y,vec_vals; 520 __m256i vec_idx; 521 __mmask8 mask; 522 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 523 __m256i vec_idx2,vec_idx3,vec_idx4; 524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 525 __m128d vec_x_tmp; 526 __m256d vec_x,vec_y,vec_y2,vec_vals; 527 MatScalar yval; 528 PetscInt r,row,nnz_in_row; 529 #else 530 PetscScalar sum[8]; 531 #endif 532 533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 534 #pragma disjoint(*x,*y,*aval) 535 #endif 536 537 PetscFunctionBegin; 538 PetscCall(VecGetArrayRead(xx,&x)); 539 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 541 for (i=0; i<totalslices; i++) { /* loop over slices */ 542 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 543 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 544 545 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 546 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 547 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 548 } else { 549 vec_y = _mm512_loadu_pd(&y[8*i]); 550 } 551 vec_y2 = _mm512_setzero_pd(); 552 vec_y3 = _mm512_setzero_pd(); 553 vec_y4 = _mm512_setzero_pd(); 554 555 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 556 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 557 case 3: 558 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 559 acolidx += 8; aval += 8; 560 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 561 acolidx += 8; aval += 8; 562 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 563 acolidx += 8; aval += 8; 564 j += 3; 565 break; 566 case 2: 567 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 568 acolidx += 8; aval += 8; 569 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 570 acolidx += 8; aval += 8; 571 j += 2; 572 break; 573 case 1: 574 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 575 acolidx += 8; aval += 8; 576 j += 1; 577 break; 578 } 579 #pragma novector 580 for (; j<(a->sliidx[i+1]>>3); j+=4) { 581 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 582 acolidx += 8; aval += 8; 583 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 584 acolidx += 8; aval += 8; 585 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 586 acolidx += 8; aval += 8; 587 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 588 acolidx += 8; aval += 8; 589 } 590 591 vec_y = _mm512_add_pd(vec_y,vec_y2); 592 vec_y = _mm512_add_pd(vec_y,vec_y3); 593 vec_y = _mm512_add_pd(vec_y,vec_y4); 594 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 595 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 596 } else { 597 _mm512_storeu_pd(&z[8*i],vec_y); 598 } 599 } 600 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 601 for (i=0; i<totalslices; i++) { /* loop over full slices */ 602 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 603 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 604 605 /* last slice may have padding rows. Don't use vectorization. */ 606 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 607 for (r=0; r<(A->rmap->n & 0x07); ++r) { 608 row = 8*i + r; 609 yval = (MatScalar)0.0; 610 nnz_in_row = a->rlen[row]; 611 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 612 z[row] = y[row] + yval; 613 } 614 break; 615 } 616 617 vec_y = _mm256_loadu_pd(y+8*i); 618 vec_y2 = _mm256_loadu_pd(y+8*i+4); 619 620 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 621 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 622 vec_vals = _mm256_loadu_pd(aval); 623 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 624 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 625 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 626 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 627 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 628 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 629 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 630 aval += 4; 631 632 vec_vals = _mm256_loadu_pd(aval); 633 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 634 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 635 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 636 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 637 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 638 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 639 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 640 aval += 4; 641 } 642 643 _mm256_storeu_pd(z+i*8,vec_y); 644 _mm256_storeu_pd(z+i*8+4,vec_y2); 645 } 646 #else 647 for (i=0; i<totalslices; i++) { /* loop over slices */ 648 for (j=0; j<8; j++) sum[j] = 0.0; 649 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 650 sum[0] += aval[j] * x[acolidx[j]]; 651 sum[1] += aval[j+1] * x[acolidx[j+1]]; 652 sum[2] += aval[j+2] * x[acolidx[j+2]]; 653 sum[3] += aval[j+3] * x[acolidx[j+3]]; 654 sum[4] += aval[j+4] * x[acolidx[j+4]]; 655 sum[5] += aval[j+5] * x[acolidx[j+5]]; 656 sum[6] += aval[j+6] * x[acolidx[j+6]]; 657 sum[7] += aval[j+7] * x[acolidx[j+7]]; 658 } 659 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 660 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 661 } else { 662 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 663 } 664 } 665 #endif 666 667 PetscCall(PetscLogFlops(2.0*a->nz)); 668 PetscCall(VecRestoreArrayRead(xx,&x)); 669 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 670 PetscFunctionReturn(0); 671 } 672 673 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 674 { 675 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 676 PetscScalar *y; 677 const PetscScalar *x; 678 const MatScalar *aval=a->val; 679 const PetscInt *acolidx=a->colidx; 680 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 681 682 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 683 #pragma disjoint(*x,*y,*aval) 684 #endif 685 686 PetscFunctionBegin; 687 if (A->symmetric) { 688 PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy)); 689 PetscFunctionReturn(0); 690 } 691 if (zz != yy) PetscCall(VecCopy(zz,yy)); 692 PetscCall(VecGetArrayRead(xx,&x)); 693 PetscCall(VecGetArray(yy,&y)); 694 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 695 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 696 for (r=0; r<(A->rmap->n & 0x07); ++r) { 697 row = 8*i + r; 698 nnz_in_row = a->rlen[row]; 699 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 700 } 701 break; 702 } 703 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 704 y[acolidx[j]] += aval[j] * x[8*i]; 705 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 706 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 707 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 708 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 709 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 710 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 711 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 712 } 713 } 714 PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 715 PetscCall(VecRestoreArrayRead(xx,&x)); 716 PetscCall(VecRestoreArray(yy,&y)); 717 PetscFunctionReturn(0); 718 } 719 720 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 721 { 722 PetscFunctionBegin; 723 if (A->symmetric) { 724 PetscCall(MatMult_SeqSELL(A,xx,yy)); 725 } else { 726 PetscCall(VecSet(yy,0.0)); 727 PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 728 } 729 PetscFunctionReturn(0); 730 } 731 732 /* 733 Checks for missing diagonals 734 */ 735 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 736 { 737 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 738 PetscInt *diag,i; 739 740 PetscFunctionBegin; 741 *missing = PETSC_FALSE; 742 if (A->rmap->n > 0 && !(a->colidx)) { 743 *missing = PETSC_TRUE; 744 if (d) *d = 0; 745 PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 746 } else { 747 diag = a->diag; 748 for (i=0; i<A->rmap->n; i++) { 749 if (diag[i] == -1) { 750 *missing = PETSC_TRUE; 751 if (d) *d = i; 752 PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 753 break; 754 } 755 } 756 } 757 PetscFunctionReturn(0); 758 } 759 760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 761 { 762 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 763 PetscInt i,j,m=A->rmap->n,shift; 764 765 PetscFunctionBegin; 766 if (!a->diag) { 767 PetscCall(PetscMalloc1(m,&a->diag)); 768 PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 769 a->free_diag = PETSC_TRUE; 770 } 771 for (i=0; i<m; i++) { /* loop over rows */ 772 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 773 a->diag[i] = -1; 774 for (j=0; j<a->rlen[i]; j++) { 775 if (a->colidx[shift+j*8] == i) { 776 a->diag[i] = shift+j*8; 777 break; 778 } 779 } 780 } 781 PetscFunctionReturn(0); 782 } 783 784 /* 785 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 786 */ 787 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 788 { 789 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 790 PetscInt i,*diag,m = A->rmap->n; 791 MatScalar *val = a->val; 792 PetscScalar *idiag,*mdiag; 793 794 PetscFunctionBegin; 795 if (a->idiagvalid) PetscFunctionReturn(0); 796 PetscCall(MatMarkDiagonal_SeqSELL(A)); 797 diag = a->diag; 798 if (!a->idiag) { 799 PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 800 PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 801 val = a->val; 802 } 803 mdiag = a->mdiag; 804 idiag = a->idiag; 805 806 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 807 for (i=0; i<m; i++) { 808 mdiag[i] = val[diag[i]]; 809 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 810 if (PetscRealPart(fshift)) { 811 PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 812 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 813 A->factorerror_zeropivot_value = 0.0; 814 A->factorerror_zeropivot_row = i; 815 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 816 } 817 idiag[i] = 1.0/val[diag[i]]; 818 } 819 PetscCall(PetscLogFlops(m)); 820 } else { 821 for (i=0; i<m; i++) { 822 mdiag[i] = val[diag[i]]; 823 idiag[i] = omega/(fshift + val[diag[i]]); 824 } 825 PetscCall(PetscLogFlops(2.0*m)); 826 } 827 a->idiagvalid = PETSC_TRUE; 828 PetscFunctionReturn(0); 829 } 830 831 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 832 { 833 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 834 835 PetscFunctionBegin; 836 PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 837 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 838 PetscFunctionReturn(0); 839 } 840 841 PetscErrorCode MatDestroy_SeqSELL(Mat A) 842 { 843 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 844 845 PetscFunctionBegin; 846 #if defined(PETSC_USE_LOG) 847 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 848 #endif 849 PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 850 PetscCall(ISDestroy(&a->row)); 851 PetscCall(ISDestroy(&a->col)); 852 PetscCall(PetscFree(a->diag)); 853 PetscCall(PetscFree(a->rlen)); 854 PetscCall(PetscFree(a->sliidx)); 855 PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 856 PetscCall(PetscFree(a->solve_work)); 857 PetscCall(ISDestroy(&a->icol)); 858 PetscCall(PetscFree(a->saved_values)); 859 PetscCall(PetscFree2(a->getrowcols,a->getrowvals)); 860 861 PetscCall(PetscFree(A->data)); 862 863 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 864 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 865 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 866 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 867 PetscFunctionReturn(0); 868 } 869 870 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 871 { 872 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 873 874 PetscFunctionBegin; 875 switch (op) { 876 case MAT_ROW_ORIENTED: 877 a->roworiented = flg; 878 break; 879 case MAT_KEEP_NONZERO_PATTERN: 880 a->keepnonzeropattern = flg; 881 break; 882 case MAT_NEW_NONZERO_LOCATIONS: 883 a->nonew = (flg ? 0 : 1); 884 break; 885 case MAT_NEW_NONZERO_LOCATION_ERR: 886 a->nonew = (flg ? -1 : 0); 887 break; 888 case MAT_NEW_NONZERO_ALLOCATION_ERR: 889 a->nonew = (flg ? -2 : 0); 890 break; 891 case MAT_UNUSED_NONZERO_LOCATION_ERR: 892 a->nounused = (flg ? -1 : 0); 893 break; 894 case MAT_FORCE_DIAGONAL_ENTRIES: 895 case MAT_IGNORE_OFF_PROC_ENTRIES: 896 case MAT_USE_HASH_TABLE: 897 case MAT_SORTED_FULL: 898 PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 899 break; 900 case MAT_SPD: 901 case MAT_SYMMETRIC: 902 case MAT_STRUCTURALLY_SYMMETRIC: 903 case MAT_HERMITIAN: 904 case MAT_SYMMETRY_ETERNAL: 905 /* These options are handled directly by MatSetOption() */ 906 break; 907 default: 908 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 909 } 910 PetscFunctionReturn(0); 911 } 912 913 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 914 { 915 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 916 PetscInt i,j,n,shift; 917 PetscScalar *x,zero=0.0; 918 919 PetscFunctionBegin; 920 PetscCall(VecGetLocalSize(v,&n)); 921 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 922 923 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 924 PetscInt *diag=a->diag; 925 PetscCall(VecGetArray(v,&x)); 926 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 927 PetscCall(VecRestoreArray(v,&x)); 928 PetscFunctionReturn(0); 929 } 930 931 PetscCall(VecSet(v,zero)); 932 PetscCall(VecGetArray(v,&x)); 933 for (i=0; i<n; i++) { /* loop over rows */ 934 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 935 x[i] = 0; 936 for (j=0; j<a->rlen[i]; j++) { 937 if (a->colidx[shift+j*8] == i) { 938 x[i] = a->val[shift+j*8]; 939 break; 940 } 941 } 942 } 943 PetscCall(VecRestoreArray(v,&x)); 944 PetscFunctionReturn(0); 945 } 946 947 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 948 { 949 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 950 const PetscScalar *l,*r; 951 PetscInt i,j,m,n,row; 952 953 PetscFunctionBegin; 954 if (ll) { 955 /* The local size is used so that VecMPI can be passed to this routine 956 by MatDiagonalScale_MPISELL */ 957 PetscCall(VecGetLocalSize(ll,&m)); 958 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 959 PetscCall(VecGetArrayRead(ll,&l)); 960 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 961 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 962 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 963 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 964 } 965 } else { 966 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 967 a->val[j] *= l[8*i+row]; 968 } 969 } 970 } 971 PetscCall(VecRestoreArrayRead(ll,&l)); 972 PetscCall(PetscLogFlops(a->nz)); 973 } 974 if (rr) { 975 PetscCall(VecGetLocalSize(rr,&n)); 976 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 977 PetscCall(VecGetArrayRead(rr,&r)); 978 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 979 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 980 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 981 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 982 } 983 } else { 984 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 985 a->val[j] *= r[a->colidx[j]]; 986 } 987 } 988 } 989 PetscCall(VecRestoreArrayRead(rr,&r)); 990 PetscCall(PetscLogFlops(a->nz)); 991 } 992 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 993 PetscFunctionReturn(0); 994 } 995 996 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 997 { 998 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 999 PetscInt *cp,i,k,low,high,t,row,col,l; 1000 PetscInt shift; 1001 MatScalar *vp; 1002 1003 PetscFunctionBegin; 1004 for (k=0; k<m; k++) { /* loop over requested rows */ 1005 row = im[k]; 1006 if (row<0) continue; 1007 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1008 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1009 cp = a->colidx+shift; /* pointer to the row */ 1010 vp = a->val+shift; /* pointer to the row */ 1011 for (l=0; l<n; l++) { /* loop over requested columns */ 1012 col = in[l]; 1013 if (col<0) continue; 1014 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1015 high = a->rlen[row]; low = 0; /* assume unsorted */ 1016 while (high-low > 5) { 1017 t = (low+high)/2; 1018 if (*(cp+t*8) > col) high = t; 1019 else low = t; 1020 } 1021 for (i=low; i<high; i++) { 1022 if (*(cp+8*i) > col) break; 1023 if (*(cp+8*i) == col) { 1024 *v++ = *(vp+8*i); 1025 goto finished; 1026 } 1027 } 1028 *v++ = 0.0; 1029 finished:; 1030 } 1031 } 1032 PetscFunctionReturn(0); 1033 } 1034 1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1036 { 1037 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1038 PetscInt i,j,m=A->rmap->n,shift; 1039 const char *name; 1040 PetscViewerFormat format; 1041 1042 PetscFunctionBegin; 1043 PetscCall(PetscViewerGetFormat(viewer,&format)); 1044 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1045 PetscInt nofinalvalue = 0; 1046 /* 1047 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1048 nofinalvalue = 1; 1049 } 1050 */ 1051 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1052 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1053 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1054 #if defined(PETSC_USE_COMPLEX) 1055 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1056 #else 1057 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1058 #endif 1059 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1060 1061 for (i=0; i<m; i++) { 1062 shift = a->sliidx[i>>3]+(i&0x07); 1063 for (j=0; j<a->rlen[i]; j++) { 1064 #if defined(PETSC_USE_COMPLEX) 1065 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1066 #else 1067 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1068 #endif 1069 } 1070 } 1071 /* 1072 if (nofinalvalue) { 1073 #if defined(PETSC_USE_COMPLEX) 1074 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1075 #else 1076 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1077 #endif 1078 } 1079 */ 1080 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1081 PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1082 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1083 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1084 PetscFunctionReturn(0); 1085 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1086 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1087 for (i=0; i<m; i++) { 1088 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1089 shift = a->sliidx[i>>3]+(i&0x07); 1090 for (j=0; j<a->rlen[i]; j++) { 1091 #if defined(PETSC_USE_COMPLEX) 1092 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1093 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1094 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1095 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1096 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1097 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1098 } 1099 #else 1100 if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1101 #endif 1102 } 1103 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1104 } 1105 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1106 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1107 PetscInt cnt=0,jcnt; 1108 PetscScalar value; 1109 #if defined(PETSC_USE_COMPLEX) 1110 PetscBool realonly=PETSC_TRUE; 1111 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1112 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1113 realonly = PETSC_FALSE; 1114 break; 1115 } 1116 } 1117 #endif 1118 1119 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1120 for (i=0; i<m; i++) { 1121 jcnt = 0; 1122 shift = a->sliidx[i>>3]+(i&0x07); 1123 for (j=0; j<A->cmap->n; j++) { 1124 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1125 value = a->val[cnt++]; 1126 jcnt++; 1127 } else { 1128 value = 0.0; 1129 } 1130 #if defined(PETSC_USE_COMPLEX) 1131 if (realonly) { 1132 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1133 } else { 1134 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1135 } 1136 #else 1137 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1138 #endif 1139 } 1140 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1141 } 1142 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1143 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1144 PetscInt fshift=1; 1145 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1146 #if defined(PETSC_USE_COMPLEX) 1147 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1148 #else 1149 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1150 #endif 1151 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1152 for (i=0; i<m; i++) { 1153 shift = a->sliidx[i>>3]+(i&0x07); 1154 for (j=0; j<a->rlen[i]; j++) { 1155 #if defined(PETSC_USE_COMPLEX) 1156 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1157 #else 1158 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1159 #endif 1160 } 1161 } 1162 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1163 } else if (format == PETSC_VIEWER_NATIVE) { 1164 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1165 PetscInt row; 1166 PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1167 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1168 #if defined(PETSC_USE_COMPLEX) 1169 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1170 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1171 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1172 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1173 } else { 1174 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1175 } 1176 #else 1177 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1178 #endif 1179 } 1180 } 1181 } else { 1182 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1183 if (A->factortype) { 1184 for (i=0; i<m; i++) { 1185 shift = a->sliidx[i>>3]+(i&0x07); 1186 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1187 /* L part */ 1188 for (j=shift; j<a->diag[i]; j+=8) { 1189 #if defined(PETSC_USE_COMPLEX) 1190 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1191 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1192 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1193 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1194 } else { 1195 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1196 } 1197 #else 1198 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1199 #endif 1200 } 1201 /* diagonal */ 1202 j = a->diag[i]; 1203 #if defined(PETSC_USE_COMPLEX) 1204 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1205 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1206 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1207 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1208 } else { 1209 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1210 } 1211 #else 1212 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1213 #endif 1214 1215 /* U part */ 1216 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1217 #if defined(PETSC_USE_COMPLEX) 1218 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1219 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1220 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1221 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1222 } else { 1223 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1224 } 1225 #else 1226 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1227 #endif 1228 } 1229 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1230 } 1231 } else { 1232 for (i=0; i<m; i++) { 1233 shift = a->sliidx[i>>3]+(i&0x07); 1234 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1235 for (j=0; j<a->rlen[i]; j++) { 1236 #if defined(PETSC_USE_COMPLEX) 1237 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1238 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1239 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1240 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1241 } else { 1242 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1243 } 1244 #else 1245 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1246 #endif 1247 } 1248 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1249 } 1250 } 1251 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1252 } 1253 PetscCall(PetscViewerFlush(viewer)); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 #include <petscdraw.h> 1258 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1259 { 1260 Mat A=(Mat)Aa; 1261 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1262 PetscInt i,j,m=A->rmap->n,shift; 1263 int color; 1264 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1265 PetscViewer viewer; 1266 PetscViewerFormat format; 1267 1268 PetscFunctionBegin; 1269 PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1270 PetscCall(PetscViewerGetFormat(viewer,&format)); 1271 PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1272 1273 /* loop over matrix elements drawing boxes */ 1274 1275 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1276 PetscDrawCollectiveBegin(draw); 1277 /* Blue for negative, Cyan for zero and Red for positive */ 1278 color = PETSC_DRAW_BLUE; 1279 for (i=0; i<m; i++) { 1280 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1281 y_l = m - i - 1.0; y_r = y_l + 1.0; 1282 for (j=0; j<a->rlen[i]; j++) { 1283 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1284 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1285 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1286 } 1287 } 1288 color = PETSC_DRAW_CYAN; 1289 for (i=0; i<m; i++) { 1290 shift = a->sliidx[i>>3]+(i&0x07); 1291 y_l = m - i - 1.0; y_r = y_l + 1.0; 1292 for (j=0; j<a->rlen[i]; j++) { 1293 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1294 if (a->val[shift+8*j] != 0.) continue; 1295 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1296 } 1297 } 1298 color = PETSC_DRAW_RED; 1299 for (i=0; i<m; i++) { 1300 shift = a->sliidx[i>>3]+(i&0x07); 1301 y_l = m - i - 1.0; y_r = y_l + 1.0; 1302 for (j=0; j<a->rlen[i]; j++) { 1303 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1304 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1305 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1306 } 1307 } 1308 PetscDrawCollectiveEnd(draw); 1309 } else { 1310 /* use contour shading to indicate magnitude of values */ 1311 /* first determine max of all nonzero values */ 1312 PetscReal minv=0.0,maxv=0.0; 1313 PetscInt count=0; 1314 PetscDraw popup; 1315 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1316 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1317 } 1318 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1319 PetscCall(PetscDrawGetPopup(draw,&popup)); 1320 PetscCall(PetscDrawScalePopup(popup,minv,maxv)); 1321 1322 PetscDrawCollectiveBegin(draw); 1323 for (i=0; i<m; i++) { 1324 shift = a->sliidx[i>>3]+(i&0x07); 1325 y_l = m - i - 1.0; 1326 y_r = y_l + 1.0; 1327 for (j=0; j<a->rlen[i]; j++) { 1328 x_l = a->colidx[shift+j*8]; 1329 x_r = x_l + 1.0; 1330 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1331 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1332 count++; 1333 } 1334 } 1335 PetscDrawCollectiveEnd(draw); 1336 } 1337 PetscFunctionReturn(0); 1338 } 1339 1340 #include <petscdraw.h> 1341 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1342 { 1343 PetscDraw draw; 1344 PetscReal xr,yr,xl,yl,h,w; 1345 PetscBool isnull; 1346 1347 PetscFunctionBegin; 1348 PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw)); 1349 PetscCall(PetscDrawIsNull(draw,&isnull)); 1350 if (isnull) PetscFunctionReturn(0); 1351 1352 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1353 xr += w; yr += h; xl = -w; yl = -h; 1354 PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1355 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1356 PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1357 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1358 PetscCall(PetscDrawSave(draw)); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1363 { 1364 PetscBool iascii,isbinary,isdraw; 1365 1366 PetscFunctionBegin; 1367 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1368 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1369 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1370 if (iascii) { 1371 PetscCall(MatView_SeqSELL_ASCII(A,viewer)); 1372 } else if (isbinary) { 1373 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1374 } else if (isdraw) { 1375 PetscCall(MatView_SeqSELL_Draw(A,viewer)); 1376 } 1377 PetscFunctionReturn(0); 1378 } 1379 1380 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1381 { 1382 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1383 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1384 MatScalar *vp; 1385 1386 PetscFunctionBegin; 1387 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1388 /* To do: compress out the unused elements */ 1389 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1390 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1391 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1392 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1393 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1394 for (i=0; i<a->totalslices; ++i) { 1395 shift = a->sliidx[i]; /* starting index of the slice */ 1396 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1397 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1398 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1399 row = 8*i + row_in_slice; 1400 nrow = a->rlen[row]; /* number of nonzeros in row */ 1401 /* 1402 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1403 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1404 */ 1405 lastcol = 0; 1406 if (nrow>0) { /* nonempty row */ 1407 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1408 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1409 for (j=1;j<8;j++) { 1410 if (a->rlen[8*i+j]) { 1411 lastcol = cp[j]; 1412 break; 1413 } 1414 } 1415 } else { 1416 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1417 } 1418 1419 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1420 cp[8*k+row_in_slice] = lastcol; 1421 vp[8*k+row_in_slice] = (MatScalar)0; 1422 } 1423 } 1424 } 1425 1426 A->info.mallocs += a->reallocs; 1427 a->reallocs = 0; 1428 1429 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1430 PetscFunctionReturn(0); 1431 } 1432 1433 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1434 { 1435 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1436 1437 PetscFunctionBegin; 1438 info->block_size = 1.0; 1439 info->nz_allocated = a->maxallocmat; 1440 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1441 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1442 info->assemblies = A->num_ass; 1443 info->mallocs = A->info.mallocs; 1444 info->memory = ((PetscObject)A)->mem; 1445 if (A->factortype) { 1446 info->fill_ratio_given = A->info.fill_ratio_given; 1447 info->fill_ratio_needed = A->info.fill_ratio_needed; 1448 info->factor_mallocs = A->info.factor_mallocs; 1449 } else { 1450 info->fill_ratio_given = 0; 1451 info->fill_ratio_needed = 0; 1452 info->factor_mallocs = 0; 1453 } 1454 PetscFunctionReturn(0); 1455 } 1456 1457 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1458 { 1459 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1460 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1461 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1462 MatScalar *vp,value; 1463 1464 PetscFunctionBegin; 1465 for (k=0; k<m; k++) { /* loop over added rows */ 1466 row = im[k]; 1467 if (row < 0) continue; 1468 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1469 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1470 cp = a->colidx+shift; /* pointer to the row */ 1471 vp = a->val+shift; /* pointer to the row */ 1472 nrow = a->rlen[row]; 1473 low = 0; 1474 high = nrow; 1475 1476 for (l=0; l<n; l++) { /* loop over added columns */ 1477 col = in[l]; 1478 if (col<0) continue; 1479 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1480 if (a->roworiented) { 1481 value = v[l+k*n]; 1482 } else { 1483 value = v[k+l*m]; 1484 } 1485 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1486 1487 /* search in this row for the specified column, i indicates the column to be set */ 1488 if (col <= lastcol) low = 0; 1489 else high = nrow; 1490 lastcol = col; 1491 while (high-low > 5) { 1492 t = (low+high)/2; 1493 if (*(cp+t*8) > col) high = t; 1494 else low = t; 1495 } 1496 for (i=low; i<high; i++) { 1497 if (*(cp+i*8) > col) break; 1498 if (*(cp+i*8) == col) { 1499 if (is == ADD_VALUES) *(vp+i*8) += value; 1500 else *(vp+i*8) = value; 1501 low = i + 1; 1502 goto noinsert; 1503 } 1504 } 1505 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1506 if (nonew == 1) goto noinsert; 1507 PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1508 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1509 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1510 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1511 for (ii=nrow-1; ii>=i; ii--) { 1512 *(cp+(ii+1)*8) = *(cp+ii*8); 1513 *(vp+(ii+1)*8) = *(vp+ii*8); 1514 } 1515 a->rlen[row]++; 1516 *(cp+i*8) = col; 1517 *(vp+i*8) = value; 1518 a->nz++; 1519 A->nonzerostate++; 1520 low = i+1; high++; nrow++; 1521 noinsert:; 1522 } 1523 a->rlen[row] = nrow; 1524 } 1525 PetscFunctionReturn(0); 1526 } 1527 1528 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1529 { 1530 PetscFunctionBegin; 1531 /* If the two matrices have the same copy implementation, use fast copy. */ 1532 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1533 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1534 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1535 1536 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1537 PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1538 } else { 1539 PetscCall(MatCopy_Basic(A,B,str)); 1540 } 1541 PetscFunctionReturn(0); 1542 } 1543 1544 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1545 { 1546 PetscFunctionBegin; 1547 PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1552 { 1553 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1554 1555 PetscFunctionBegin; 1556 *array = a->val; 1557 PetscFunctionReturn(0); 1558 } 1559 1560 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1561 { 1562 PetscFunctionBegin; 1563 PetscFunctionReturn(0); 1564 } 1565 1566 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1567 { 1568 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1569 PetscInt i; 1570 MatScalar *aval=a->val; 1571 1572 PetscFunctionBegin; 1573 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1574 PetscFunctionReturn(0); 1575 } 1576 1577 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1578 { 1579 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1580 PetscInt i; 1581 MatScalar *aval=a->val; 1582 1583 PetscFunctionBegin; 1584 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1585 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1586 PetscFunctionReturn(0); 1587 } 1588 1589 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1590 { 1591 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1592 MatScalar *aval=a->val; 1593 PetscScalar oalpha=alpha; 1594 PetscBLASInt one=1,size; 1595 1596 PetscFunctionBegin; 1597 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size)); 1598 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1599 PetscCall(PetscLogFlops(a->nz)); 1600 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1601 PetscFunctionReturn(0); 1602 } 1603 1604 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1605 { 1606 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1607 1608 PetscFunctionBegin; 1609 if (!Y->preallocated || !y->nz) { 1610 PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL)); 1611 } 1612 PetscCall(MatShift_Basic(Y,a)); 1613 PetscFunctionReturn(0); 1614 } 1615 1616 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1617 { 1618 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1619 PetscScalar *x,sum,*t; 1620 const MatScalar *idiag=NULL,*mdiag; 1621 const PetscScalar *b,*xb; 1622 PetscInt n,m=A->rmap->n,i,j,shift; 1623 const PetscInt *diag; 1624 1625 PetscFunctionBegin; 1626 its = its*lits; 1627 1628 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1629 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift)); 1630 a->fshift = fshift; 1631 a->omega = omega; 1632 1633 diag = a->diag; 1634 t = a->ssor_work; 1635 idiag = a->idiag; 1636 mdiag = a->mdiag; 1637 1638 PetscCall(VecGetArray(xx,&x)); 1639 PetscCall(VecGetArrayRead(bb,&b)); 1640 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1641 PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1642 PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1643 PetscCheckFalse(flag & SOR_EISENSTAT,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1644 1645 if (flag & SOR_ZERO_INITIAL_GUESS) { 1646 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1647 for (i=0; i<m; i++) { 1648 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1649 sum = b[i]; 1650 n = (diag[i]-shift)/8; 1651 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1652 t[i] = sum; 1653 x[i] = sum*idiag[i]; 1654 } 1655 xb = t; 1656 PetscCall(PetscLogFlops(a->nz)); 1657 } else xb = b; 1658 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1659 for (i=m-1; i>=0; i--) { 1660 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1661 sum = xb[i]; 1662 n = a->rlen[i]-(diag[i]-shift)/8-1; 1663 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1664 if (xb == b) { 1665 x[i] = sum*idiag[i]; 1666 } else { 1667 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1668 } 1669 } 1670 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1671 } 1672 its--; 1673 } 1674 while (its--) { 1675 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1676 for (i=0; i<m; i++) { 1677 /* lower */ 1678 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1679 sum = b[i]; 1680 n = (diag[i]-shift)/8; 1681 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1682 t[i] = sum; /* save application of the lower-triangular part */ 1683 /* upper */ 1684 n = a->rlen[i]-(diag[i]-shift)/8-1; 1685 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1686 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1687 } 1688 xb = t; 1689 PetscCall(PetscLogFlops(2.0*a->nz)); 1690 } else xb = b; 1691 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1692 for (i=m-1; i>=0; i--) { 1693 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1694 sum = xb[i]; 1695 if (xb == b) { 1696 /* whole matrix (no checkpointing available) */ 1697 n = a->rlen[i]; 1698 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1699 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1700 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1701 n = a->rlen[i]-(diag[i]-shift)/8-1; 1702 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1703 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1704 } 1705 } 1706 if (xb == b) { 1707 PetscCall(PetscLogFlops(2.0*a->nz)); 1708 } else { 1709 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1710 } 1711 } 1712 } 1713 PetscCall(VecRestoreArray(xx,&x)); 1714 PetscCall(VecRestoreArrayRead(bb,&b)); 1715 PetscFunctionReturn(0); 1716 } 1717 1718 /* -------------------------------------------------------------------*/ 1719 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1720 MatGetRow_SeqSELL, 1721 MatRestoreRow_SeqSELL, 1722 MatMult_SeqSELL, 1723 /* 4*/ MatMultAdd_SeqSELL, 1724 MatMultTranspose_SeqSELL, 1725 MatMultTransposeAdd_SeqSELL, 1726 NULL, 1727 NULL, 1728 NULL, 1729 /* 10*/ NULL, 1730 NULL, 1731 NULL, 1732 MatSOR_SeqSELL, 1733 NULL, 1734 /* 15*/ MatGetInfo_SeqSELL, 1735 MatEqual_SeqSELL, 1736 MatGetDiagonal_SeqSELL, 1737 MatDiagonalScale_SeqSELL, 1738 NULL, 1739 /* 20*/ NULL, 1740 MatAssemblyEnd_SeqSELL, 1741 MatSetOption_SeqSELL, 1742 MatZeroEntries_SeqSELL, 1743 /* 24*/ NULL, 1744 NULL, 1745 NULL, 1746 NULL, 1747 NULL, 1748 /* 29*/ MatSetUp_SeqSELL, 1749 NULL, 1750 NULL, 1751 NULL, 1752 NULL, 1753 /* 34*/ MatDuplicate_SeqSELL, 1754 NULL, 1755 NULL, 1756 NULL, 1757 NULL, 1758 /* 39*/ NULL, 1759 NULL, 1760 NULL, 1761 MatGetValues_SeqSELL, 1762 MatCopy_SeqSELL, 1763 /* 44*/ NULL, 1764 MatScale_SeqSELL, 1765 MatShift_SeqSELL, 1766 NULL, 1767 NULL, 1768 /* 49*/ NULL, 1769 NULL, 1770 NULL, 1771 NULL, 1772 NULL, 1773 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1774 NULL, 1775 NULL, 1776 NULL, 1777 NULL, 1778 /* 59*/ NULL, 1779 MatDestroy_SeqSELL, 1780 MatView_SeqSELL, 1781 NULL, 1782 NULL, 1783 /* 64*/ NULL, 1784 NULL, 1785 NULL, 1786 NULL, 1787 NULL, 1788 /* 69*/ NULL, 1789 NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 /* 74*/ NULL, 1794 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1795 NULL, 1796 NULL, 1797 NULL, 1798 /* 79*/ NULL, 1799 NULL, 1800 NULL, 1801 NULL, 1802 NULL, 1803 /* 84*/ NULL, 1804 NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 /* 89*/ NULL, 1809 NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 /* 94*/ NULL, 1814 NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 /* 99*/ NULL, 1819 NULL, 1820 NULL, 1821 MatConjugate_SeqSELL, 1822 NULL, 1823 /*104*/ NULL, 1824 NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 /*109*/ NULL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 MatMissingDiagonal_SeqSELL, 1833 /*114*/ NULL, 1834 NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 /*119*/ NULL, 1839 NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 /*124*/ NULL, 1844 NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 /*129*/ NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 /*134*/ NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 /*139*/ NULL, 1859 NULL, 1860 NULL, 1861 MatFDColoringSetUp_SeqXAIJ, 1862 NULL, 1863 /*144*/ NULL, 1864 NULL, 1865 NULL, 1866 NULL 1867 }; 1868 1869 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1870 { 1871 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1872 1873 PetscFunctionBegin; 1874 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1875 1876 /* allocate space for values if not already there */ 1877 if (!a->saved_values) { 1878 PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values)); 1879 PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar))); 1880 } 1881 1882 /* copy values over */ 1883 PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices])); 1884 PetscFunctionReturn(0); 1885 } 1886 1887 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1888 { 1889 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1890 1891 PetscFunctionBegin; 1892 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1893 PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1894 PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices])); 1895 PetscFunctionReturn(0); 1896 } 1897 1898 /*@C 1899 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1900 1901 Not Collective 1902 1903 Input Parameters: 1904 . mat - a MATSEQSELL matrix 1905 . array - pointer to the data 1906 1907 Level: intermediate 1908 1909 .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90() 1910 @*/ 1911 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1912 { 1913 PetscFunctionBegin; 1914 PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array)); 1915 PetscFunctionReturn(0); 1916 } 1917 1918 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1919 { 1920 Mat_SeqSELL *b; 1921 PetscMPIInt size; 1922 1923 PetscFunctionBegin; 1924 PetscCall(PetscCitationsRegister(citation,&cited)); 1925 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 1926 PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1927 1928 PetscCall(PetscNewLog(B,&b)); 1929 1930 B->data = (void*)b; 1931 1932 PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps))); 1933 1934 b->row = NULL; 1935 b->col = NULL; 1936 b->icol = NULL; 1937 b->reallocs = 0; 1938 b->ignorezeroentries = PETSC_FALSE; 1939 b->roworiented = PETSC_TRUE; 1940 b->nonew = 0; 1941 b->diag = NULL; 1942 b->solve_work = NULL; 1943 B->spptr = NULL; 1944 b->saved_values = NULL; 1945 b->idiag = NULL; 1946 b->mdiag = NULL; 1947 b->ssor_work = NULL; 1948 b->omega = 1.0; 1949 b->fshift = 0.0; 1950 b->idiagvalid = PETSC_FALSE; 1951 b->keepnonzeropattern = PETSC_FALSE; 1952 1953 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL)); 1954 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL)); 1955 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL)); 1956 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL)); 1957 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL)); 1958 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL)); 1959 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ)); 1960 PetscFunctionReturn(0); 1961 } 1962 1963 /* 1964 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1965 */ 1966 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1967 { 1968 Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data; 1969 PetscInt i,m=A->rmap->n; 1970 PetscInt totalslices=a->totalslices; 1971 1972 PetscFunctionBegin; 1973 C->factortype = A->factortype; 1974 c->row = NULL; 1975 c->col = NULL; 1976 c->icol = NULL; 1977 c->reallocs = 0; 1978 C->assembled = PETSC_TRUE; 1979 1980 PetscCall(PetscLayoutReference(A->rmap,&C->rmap)); 1981 PetscCall(PetscLayoutReference(A->cmap,&C->cmap)); 1982 1983 PetscCall(PetscMalloc1(8*totalslices,&c->rlen)); 1984 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 1985 PetscCall(PetscMalloc1(totalslices+1,&c->sliidx)); 1986 PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt))); 1987 1988 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 1989 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 1990 1991 /* allocate the matrix space */ 1992 if (mallocmatspace) { 1993 PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx)); 1994 PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)))); 1995 1996 c->singlemalloc = PETSC_TRUE; 1997 1998 if (m > 0) { 1999 PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat)); 2000 if (cpvalues == MAT_COPY_VALUES) { 2001 PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat)); 2002 } else { 2003 PetscCall(PetscArrayzero(c->val,a->maxallocmat)); 2004 } 2005 } 2006 } 2007 2008 c->ignorezeroentries = a->ignorezeroentries; 2009 c->roworiented = a->roworiented; 2010 c->nonew = a->nonew; 2011 if (a->diag) { 2012 PetscCall(PetscMalloc1(m,&c->diag)); 2013 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 2014 for (i=0; i<m; i++) { 2015 c->diag[i] = a->diag[i]; 2016 } 2017 } else c->diag = NULL; 2018 2019 c->solve_work = NULL; 2020 c->saved_values = NULL; 2021 c->idiag = NULL; 2022 c->ssor_work = NULL; 2023 c->keepnonzeropattern = a->keepnonzeropattern; 2024 c->free_val = PETSC_TRUE; 2025 c->free_colidx = PETSC_TRUE; 2026 2027 c->maxallocmat = a->maxallocmat; 2028 c->maxallocrow = a->maxallocrow; 2029 c->rlenmax = a->rlenmax; 2030 c->nz = a->nz; 2031 C->preallocated = PETSC_TRUE; 2032 2033 c->nonzerorowcnt = a->nonzerorowcnt; 2034 C->nonzerostate = A->nonzerostate; 2035 2036 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist)); 2037 PetscFunctionReturn(0); 2038 } 2039 2040 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2041 { 2042 PetscFunctionBegin; 2043 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2044 PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n)); 2045 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2046 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2047 } 2048 PetscCall(MatSetType(*B,((PetscObject)A)->type_name)); 2049 PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE)); 2050 PetscFunctionReturn(0); 2051 } 2052 2053 /*MC 2054 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2055 based on the sliced Ellpack format 2056 2057 Options Database Keys: 2058 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions() 2059 2060 Level: beginner 2061 2062 .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ 2063 M*/ 2064 2065 /*MC 2066 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2067 2068 This matrix type is identical to MATSEQSELL when constructed with a single process communicator, 2069 and MATMPISELL otherwise. As a result, for single process communicators, 2070 MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported 2071 for communicators controlling multiple processes. It is recommended that you call both of 2072 the above preallocation routines for simplicity. 2073 2074 Options Database Keys: 2075 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2076 2077 Level: beginner 2078 2079 Notes: 2080 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2081 2082 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2083 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2084 2085 Developer Notes: 2086 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2087 2088 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2089 .vb 2090 (2 0 3 4) 2091 Consider the matrix A = (5 0 6 0) 2092 (0 0 7 8) 2093 (0 0 9 9) 2094 2095 symbolically the Ellpack format can be written as 2096 2097 (2 3 4 |) (0 2 3 |) 2098 v = (5 6 0 |) colidx = (0 2 2 |) 2099 -------- --------- 2100 (7 8 |) (2 3 |) 2101 (9 9 |) (2 3 |) 2102 2103 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2104 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2105 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2106 2107 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2108 2109 .ve 2110 2111 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2112 2113 References: 2114 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2115 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2116 2117 .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ 2118 M*/ 2119 2120 /*@C 2121 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2122 2123 Collective on comm 2124 2125 Input Parameters: 2126 + comm - MPI communicator, set to PETSC_COMM_SELF 2127 . m - number of rows 2128 . n - number of columns 2129 . rlenmax - maximum number of nonzeros in a row 2130 - rlen - array containing the number of nonzeros in the various rows 2131 (possibly different for each row) or NULL 2132 2133 Output Parameter: 2134 . A - the matrix 2135 2136 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2137 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2138 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2139 2140 Notes: 2141 If nnz is given then nz is ignored 2142 2143 Specify the preallocated storage with either rlenmax or rlen (not both). 2144 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2145 allocation. For large problems you MUST preallocate memory or you 2146 will get TERRIBLE performance, see the users' manual chapter on matrices. 2147 2148 Level: intermediate 2149 2150 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL 2151 2152 @*/ 2153 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2154 { 2155 PetscFunctionBegin; 2156 PetscCall(MatCreate(comm,A)); 2157 PetscCall(MatSetSizes(*A,m,n,m,n)); 2158 PetscCall(MatSetType(*A,MATSEQSELL)); 2159 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen)); 2160 PetscFunctionReturn(0); 2161 } 2162 2163 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2164 { 2165 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2166 PetscInt totalslices=a->totalslices; 2167 2168 PetscFunctionBegin; 2169 /* If the matrix dimensions are not equal,or no of nonzeros */ 2170 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2171 *flg = PETSC_FALSE; 2172 PetscFunctionReturn(0); 2173 } 2174 /* if the a->colidx are the same */ 2175 PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg)); 2176 if (!*flg) PetscFunctionReturn(0); 2177 /* if a->val are the same */ 2178 PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg)); 2179 PetscFunctionReturn(0); 2180 } 2181 2182 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2183 { 2184 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2185 2186 PetscFunctionBegin; 2187 a->idiagvalid = PETSC_FALSE; 2188 PetscFunctionReturn(0); 2189 } 2190 2191 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2192 { 2193 #if defined(PETSC_USE_COMPLEX) 2194 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2195 PetscInt i; 2196 PetscScalar *val = a->val; 2197 2198 PetscFunctionBegin; 2199 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2200 val[i] = PetscConj(val[i]); 2201 } 2202 #else 2203 PetscFunctionBegin; 2204 #endif 2205 PetscFunctionReturn(0); 2206 } 2207