1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 79 80 Level: intermediate 81 82 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo() 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 CHKERRQ(PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen))); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 CHKERRQ(PetscLayoutSetUp(B->rmap)); 108 CHKERRQ(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheckFalse(maxallocrow < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheckFalse(rlen[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheckFalse(rlen[i] > B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) CHKERRQ(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 CHKERRQ(PetscMalloc1(totalslices+1,&b->sliidx)); 131 CHKERRQ(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 CHKERRQ(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 CHKERRQ(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 CHKERRQ(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 CHKERRQ(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 162 CHKERRQ(PetscCalloc1(8*totalslices,&b->rlen)); 163 CHKERRQ(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) { 179 CHKERRQ(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 180 } 181 PetscFunctionReturn(0); 182 } 183 184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 185 { 186 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 187 PetscInt shift; 188 189 PetscFunctionBegin; 190 PetscCheckFalse(row < 0 || row >= A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 191 if (nz) *nz = a->rlen[row]; 192 shift = a->sliidx[row>>3]+(row&0x07); 193 if (!a->getrowcols) { 194 195 CHKERRQ(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 196 } 197 if (idx) { 198 PetscInt j; 199 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 200 *idx = a->getrowcols; 201 } 202 if (v) { 203 PetscInt j; 204 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 205 *v = a->getrowvals; 206 } 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 211 { 212 PetscFunctionBegin; 213 PetscFunctionReturn(0); 214 } 215 216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 217 { 218 Mat B; 219 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (reuse == MAT_REUSE_MATRIX) { 224 B = *newmat; 225 CHKERRQ(MatZeroEntries(B)); 226 } else { 227 CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),&B)); 228 CHKERRQ(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 229 CHKERRQ(MatSetType(B,MATSEQAIJ)); 230 CHKERRQ(MatSeqAIJSetPreallocation(B,0,a->rlen)); 231 } 232 233 for (i=0; i<A->rmap->n; i++) { 234 PetscInt nz = 0,*cols = NULL; 235 PetscScalar *vals = NULL; 236 237 CHKERRQ(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 CHKERRQ(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 239 CHKERRQ(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 240 } 241 242 CHKERRQ(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 243 CHKERRQ(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 244 B->rmap->bs = A->rmap->bs; 245 246 if (reuse == MAT_INPLACE_MATRIX) { 247 CHKERRQ(MatHeaderReplace(A,&B)); 248 } else { 249 *newmat = B; 250 } 251 PetscFunctionReturn(0); 252 } 253 254 #include <../src/mat/impls/aij/seq/aij.h> 255 256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 257 { 258 Mat B; 259 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 260 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 261 const PetscInt *cols; 262 const PetscScalar *vals; 263 264 PetscFunctionBegin; 265 266 if (reuse == MAT_REUSE_MATRIX) { 267 B = *newmat; 268 } else { 269 if (PetscDefined(USE_DEBUG) || !a->ilen) { 270 CHKERRQ(PetscMalloc1(m,&rowlengths)); 271 for (i=0; i<m; i++) { 272 rowlengths[i] = ai[i+1] - ai[i]; 273 } 274 } 275 if (PetscDefined(USE_DEBUG) && a->ilen) { 276 PetscBool eq; 277 CHKERRQ(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 278 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 279 CHKERRQ(PetscFree(rowlengths)); 280 rowlengths = a->ilen; 281 } else if (a->ilen) rowlengths = a->ilen; 282 CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),&B)); 283 CHKERRQ(MatSetSizes(B,m,n,m,n)); 284 CHKERRQ(MatSetType(B,MATSEQSELL)); 285 CHKERRQ(MatSeqSELLSetPreallocation(B,0,rowlengths)); 286 if (rowlengths != a->ilen) CHKERRQ(PetscFree(rowlengths)); 287 } 288 289 for (row=0; row<m; row++) { 290 CHKERRQ(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 CHKERRQ(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 292 CHKERRQ(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 293 } 294 CHKERRQ(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 295 CHKERRQ(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 296 B->rmap->bs = A->rmap->bs; 297 298 if (reuse == MAT_INPLACE_MATRIX) { 299 CHKERRQ(MatHeaderReplace(A,&B)); 300 } else { 301 *newmat = B; 302 } 303 PetscFunctionReturn(0); 304 } 305 306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 307 { 308 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 309 PetscScalar *y; 310 const PetscScalar *x; 311 const MatScalar *aval=a->val; 312 PetscInt totalslices=a->totalslices; 313 const PetscInt *acolidx=a->colidx; 314 PetscInt i,j; 315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 316 __m512d vec_x,vec_y,vec_vals; 317 __m256i vec_idx; 318 __mmask8 mask; 319 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 320 __m256i vec_idx2,vec_idx3,vec_idx4; 321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 322 __m128i vec_idx; 323 __m256d vec_x,vec_y,vec_y2,vec_vals; 324 MatScalar yval; 325 PetscInt r,rows_left,row,nnz_in_row; 326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 327 __m128d vec_x_tmp; 328 __m256d vec_x,vec_y,vec_y2,vec_vals; 329 MatScalar yval; 330 PetscInt r,rows_left,row,nnz_in_row; 331 #else 332 PetscScalar sum[8]; 333 #endif 334 335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 336 #pragma disjoint(*x,*y,*aval) 337 #endif 338 339 PetscFunctionBegin; 340 CHKERRQ(VecGetArrayRead(xx,&x)); 341 CHKERRQ(VecGetArray(yy,&y)); 342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 343 for (i=0; i<totalslices; i++) { /* loop over slices */ 344 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 345 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 346 347 vec_y = _mm512_setzero_pd(); 348 vec_y2 = _mm512_setzero_pd(); 349 vec_y3 = _mm512_setzero_pd(); 350 vec_y4 = _mm512_setzero_pd(); 351 352 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 353 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 354 case 3: 355 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 358 acolidx += 8; aval += 8; 359 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 360 acolidx += 8; aval += 8; 361 j += 3; 362 break; 363 case 2: 364 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 365 acolidx += 8; aval += 8; 366 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 367 acolidx += 8; aval += 8; 368 j += 2; 369 break; 370 case 1: 371 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 372 acolidx += 8; aval += 8; 373 j += 1; 374 break; 375 } 376 #pragma novector 377 for (; j<(a->sliidx[i+1]>>3); j+=4) { 378 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 383 acolidx += 8; aval += 8; 384 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 385 acolidx += 8; aval += 8; 386 } 387 388 vec_y = _mm512_add_pd(vec_y,vec_y2); 389 vec_y = _mm512_add_pd(vec_y,vec_y3); 390 vec_y = _mm512_add_pd(vec_y,vec_y4); 391 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 392 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 393 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 394 } else { 395 _mm512_storeu_pd(&y[8*i],vec_y); 396 } 397 } 398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 399 for (i=0; i<totalslices; i++) { /* loop over full slices */ 400 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 401 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 402 403 /* last slice may have padding rows. Don't use vectorization. */ 404 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 405 rows_left = A->rmap->n - 8*i; 406 for (r=0; r<rows_left; ++r) { 407 yval = (MatScalar)0; 408 row = 8*i + r; 409 nnz_in_row = a->rlen[row]; 410 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 411 y[row] = yval; 412 } 413 break; 414 } 415 416 vec_y = _mm256_setzero_pd(); 417 vec_y2 = _mm256_setzero_pd(); 418 419 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 420 #pragma novector 421 #pragma unroll(2) 422 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 424 aval += 4; acolidx += 4; 425 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 426 aval += 4; acolidx += 4; 427 } 428 429 _mm256_storeu_pd(y+i*8,vec_y); 430 _mm256_storeu_pd(y+i*8+4,vec_y2); 431 } 432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 433 for (i=0; i<totalslices; i++) { /* loop over full slices */ 434 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 435 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 436 437 vec_y = _mm256_setzero_pd(); 438 vec_y2 = _mm256_setzero_pd(); 439 440 /* last slice may have padding rows. Don't use vectorization. */ 441 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 442 rows_left = A->rmap->n - 8*i; 443 for (r=0; r<rows_left; ++r) { 444 yval = (MatScalar)0; 445 row = 8*i + r; 446 nnz_in_row = a->rlen[row]; 447 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 448 y[row] = yval; 449 } 450 break; 451 } 452 453 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 454 #pragma novector 455 #pragma unroll(2) 456 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 464 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 465 aval += 4; 466 467 vec_vals = _mm256_loadu_pd(aval); 468 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 469 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 470 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 471 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 472 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 473 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 474 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 475 aval += 4; 476 } 477 478 _mm256_storeu_pd(y + i*8, vec_y); 479 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 480 } 481 #else 482 for (i=0; i<totalslices; i++) { /* loop over slices */ 483 for (j=0; j<8; j++) sum[j] = 0.0; 484 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 485 sum[0] += aval[j] * x[acolidx[j]]; 486 sum[1] += aval[j+1] * x[acolidx[j+1]]; 487 sum[2] += aval[j+2] * x[acolidx[j+2]]; 488 sum[3] += aval[j+3] * x[acolidx[j+3]]; 489 sum[4] += aval[j+4] * x[acolidx[j+4]]; 490 sum[5] += aval[j+5] * x[acolidx[j+5]]; 491 sum[6] += aval[j+6] * x[acolidx[j+6]]; 492 sum[7] += aval[j+7] * x[acolidx[j+7]]; 493 } 494 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 495 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 496 } else { 497 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 498 } 499 } 500 #endif 501 502 CHKERRQ(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 503 CHKERRQ(VecRestoreArrayRead(xx,&x)); 504 CHKERRQ(VecRestoreArray(yy,&y)); 505 PetscFunctionReturn(0); 506 } 507 508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 509 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 510 { 511 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 512 PetscScalar *y,*z; 513 const PetscScalar *x; 514 const MatScalar *aval=a->val; 515 PetscInt totalslices=a->totalslices; 516 const PetscInt *acolidx=a->colidx; 517 PetscInt i,j; 518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 519 __m512d vec_x,vec_y,vec_vals; 520 __m256i vec_idx; 521 __mmask8 mask; 522 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 523 __m256i vec_idx2,vec_idx3,vec_idx4; 524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 525 __m128d vec_x_tmp; 526 __m256d vec_x,vec_y,vec_y2,vec_vals; 527 MatScalar yval; 528 PetscInt r,row,nnz_in_row; 529 #else 530 PetscScalar sum[8]; 531 #endif 532 533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 534 #pragma disjoint(*x,*y,*aval) 535 #endif 536 537 PetscFunctionBegin; 538 CHKERRQ(VecGetArrayRead(xx,&x)); 539 CHKERRQ(VecGetArrayPair(yy,zz,&y,&z)); 540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 541 for (i=0; i<totalslices; i++) { /* loop over slices */ 542 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 543 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 544 545 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 546 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 547 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 548 } else { 549 vec_y = _mm512_loadu_pd(&y[8*i]); 550 } 551 vec_y2 = _mm512_setzero_pd(); 552 vec_y3 = _mm512_setzero_pd(); 553 vec_y4 = _mm512_setzero_pd(); 554 555 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 556 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 557 case 3: 558 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 559 acolidx += 8; aval += 8; 560 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 561 acolidx += 8; aval += 8; 562 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 563 acolidx += 8; aval += 8; 564 j += 3; 565 break; 566 case 2: 567 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 568 acolidx += 8; aval += 8; 569 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 570 acolidx += 8; aval += 8; 571 j += 2; 572 break; 573 case 1: 574 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 575 acolidx += 8; aval += 8; 576 j += 1; 577 break; 578 } 579 #pragma novector 580 for (; j<(a->sliidx[i+1]>>3); j+=4) { 581 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 582 acolidx += 8; aval += 8; 583 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 584 acolidx += 8; aval += 8; 585 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 586 acolidx += 8; aval += 8; 587 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 588 acolidx += 8; aval += 8; 589 } 590 591 vec_y = _mm512_add_pd(vec_y,vec_y2); 592 vec_y = _mm512_add_pd(vec_y,vec_y3); 593 vec_y = _mm512_add_pd(vec_y,vec_y4); 594 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 595 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 596 } else { 597 _mm512_storeu_pd(&z[8*i],vec_y); 598 } 599 } 600 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 601 for (i=0; i<totalslices; i++) { /* loop over full slices */ 602 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 603 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 604 605 /* last slice may have padding rows. Don't use vectorization. */ 606 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 607 for (r=0; r<(A->rmap->n & 0x07); ++r) { 608 row = 8*i + r; 609 yval = (MatScalar)0.0; 610 nnz_in_row = a->rlen[row]; 611 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 612 z[row] = y[row] + yval; 613 } 614 break; 615 } 616 617 vec_y = _mm256_loadu_pd(y+8*i); 618 vec_y2 = _mm256_loadu_pd(y+8*i+4); 619 620 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 621 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 622 vec_vals = _mm256_loadu_pd(aval); 623 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 624 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 625 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 626 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 627 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 628 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 629 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 630 aval += 4; 631 632 vec_vals = _mm256_loadu_pd(aval); 633 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 634 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 635 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 636 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 637 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 638 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 639 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 640 aval += 4; 641 } 642 643 _mm256_storeu_pd(z+i*8,vec_y); 644 _mm256_storeu_pd(z+i*8+4,vec_y2); 645 } 646 #else 647 for (i=0; i<totalslices; i++) { /* loop over slices */ 648 for (j=0; j<8; j++) sum[j] = 0.0; 649 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 650 sum[0] += aval[j] * x[acolidx[j]]; 651 sum[1] += aval[j+1] * x[acolidx[j+1]]; 652 sum[2] += aval[j+2] * x[acolidx[j+2]]; 653 sum[3] += aval[j+3] * x[acolidx[j+3]]; 654 sum[4] += aval[j+4] * x[acolidx[j+4]]; 655 sum[5] += aval[j+5] * x[acolidx[j+5]]; 656 sum[6] += aval[j+6] * x[acolidx[j+6]]; 657 sum[7] += aval[j+7] * x[acolidx[j+7]]; 658 } 659 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 660 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 661 } else { 662 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 663 } 664 } 665 #endif 666 667 CHKERRQ(PetscLogFlops(2.0*a->nz)); 668 CHKERRQ(VecRestoreArrayRead(xx,&x)); 669 CHKERRQ(VecRestoreArrayPair(yy,zz,&y,&z)); 670 PetscFunctionReturn(0); 671 } 672 673 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 674 { 675 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 676 PetscScalar *y; 677 const PetscScalar *x; 678 const MatScalar *aval=a->val; 679 const PetscInt *acolidx=a->colidx; 680 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 681 682 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 683 #pragma disjoint(*x,*y,*aval) 684 #endif 685 686 PetscFunctionBegin; 687 if (A->symmetric) { 688 CHKERRQ(MatMultAdd_SeqSELL(A,xx,zz,yy)); 689 PetscFunctionReturn(0); 690 } 691 if (zz != yy) CHKERRQ(VecCopy(zz,yy)); 692 CHKERRQ(VecGetArrayRead(xx,&x)); 693 CHKERRQ(VecGetArray(yy,&y)); 694 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 695 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 696 for (r=0; r<(A->rmap->n & 0x07); ++r) { 697 row = 8*i + r; 698 nnz_in_row = a->rlen[row]; 699 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 700 } 701 break; 702 } 703 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 704 y[acolidx[j]] += aval[j] * x[8*i]; 705 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 706 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 707 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 708 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 709 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 710 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 711 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 712 } 713 } 714 CHKERRQ(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 715 CHKERRQ(VecRestoreArrayRead(xx,&x)); 716 CHKERRQ(VecRestoreArray(yy,&y)); 717 PetscFunctionReturn(0); 718 } 719 720 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 721 { 722 PetscFunctionBegin; 723 if (A->symmetric) { 724 CHKERRQ(MatMult_SeqSELL(A,xx,yy)); 725 } else { 726 CHKERRQ(VecSet(yy,0.0)); 727 CHKERRQ(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 728 } 729 PetscFunctionReturn(0); 730 } 731 732 /* 733 Checks for missing diagonals 734 */ 735 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 736 { 737 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 738 PetscInt *diag,i; 739 740 PetscFunctionBegin; 741 *missing = PETSC_FALSE; 742 if (A->rmap->n > 0 && !(a->colidx)) { 743 *missing = PETSC_TRUE; 744 if (d) *d = 0; 745 CHKERRQ(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 746 } else { 747 diag = a->diag; 748 for (i=0; i<A->rmap->n; i++) { 749 if (diag[i] == -1) { 750 *missing = PETSC_TRUE; 751 if (d) *d = i; 752 CHKERRQ(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 753 break; 754 } 755 } 756 } 757 PetscFunctionReturn(0); 758 } 759 760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 761 { 762 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 763 PetscInt i,j,m=A->rmap->n,shift; 764 765 PetscFunctionBegin; 766 if (!a->diag) { 767 CHKERRQ(PetscMalloc1(m,&a->diag)); 768 CHKERRQ(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 769 a->free_diag = PETSC_TRUE; 770 } 771 for (i=0; i<m; i++) { /* loop over rows */ 772 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 773 a->diag[i] = -1; 774 for (j=0; j<a->rlen[i]; j++) { 775 if (a->colidx[shift+j*8] == i) { 776 a->diag[i] = shift+j*8; 777 break; 778 } 779 } 780 } 781 PetscFunctionReturn(0); 782 } 783 784 /* 785 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 786 */ 787 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 788 { 789 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 790 PetscInt i,*diag,m = A->rmap->n; 791 MatScalar *val = a->val; 792 PetscScalar *idiag,*mdiag; 793 794 PetscFunctionBegin; 795 if (a->idiagvalid) PetscFunctionReturn(0); 796 CHKERRQ(MatMarkDiagonal_SeqSELL(A)); 797 diag = a->diag; 798 if (!a->idiag) { 799 CHKERRQ(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 800 CHKERRQ(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 801 val = a->val; 802 } 803 mdiag = a->mdiag; 804 idiag = a->idiag; 805 806 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 807 for (i=0; i<m; i++) { 808 mdiag[i] = val[diag[i]]; 809 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 810 if (PetscRealPart(fshift)) { 811 CHKERRQ(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 812 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 813 A->factorerror_zeropivot_value = 0.0; 814 A->factorerror_zeropivot_row = i; 815 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 816 } 817 idiag[i] = 1.0/val[diag[i]]; 818 } 819 CHKERRQ(PetscLogFlops(m)); 820 } else { 821 for (i=0; i<m; i++) { 822 mdiag[i] = val[diag[i]]; 823 idiag[i] = omega/(fshift + val[diag[i]]); 824 } 825 CHKERRQ(PetscLogFlops(2.0*m)); 826 } 827 a->idiagvalid = PETSC_TRUE; 828 PetscFunctionReturn(0); 829 } 830 831 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 832 { 833 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 834 835 PetscFunctionBegin; 836 CHKERRQ(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 837 CHKERRQ(MatSeqSELLInvalidateDiagonal(A)); 838 PetscFunctionReturn(0); 839 } 840 841 PetscErrorCode MatDestroy_SeqSELL(Mat A) 842 { 843 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 844 845 PetscFunctionBegin; 846 #if defined(PETSC_USE_LOG) 847 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 848 #endif 849 CHKERRQ(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 850 CHKERRQ(ISDestroy(&a->row)); 851 CHKERRQ(ISDestroy(&a->col)); 852 CHKERRQ(PetscFree(a->diag)); 853 CHKERRQ(PetscFree(a->rlen)); 854 CHKERRQ(PetscFree(a->sliidx)); 855 CHKERRQ(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 856 CHKERRQ(PetscFree(a->solve_work)); 857 CHKERRQ(ISDestroy(&a->icol)); 858 CHKERRQ(PetscFree(a->saved_values)); 859 CHKERRQ(PetscFree2(a->getrowcols,a->getrowvals)); 860 861 CHKERRQ(PetscFree(A->data)); 862 863 CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,NULL)); 864 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 865 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 866 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 867 PetscFunctionReturn(0); 868 } 869 870 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 871 { 872 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 873 874 PetscFunctionBegin; 875 switch (op) { 876 case MAT_ROW_ORIENTED: 877 a->roworiented = flg; 878 break; 879 case MAT_KEEP_NONZERO_PATTERN: 880 a->keepnonzeropattern = flg; 881 break; 882 case MAT_NEW_NONZERO_LOCATIONS: 883 a->nonew = (flg ? 0 : 1); 884 break; 885 case MAT_NEW_NONZERO_LOCATION_ERR: 886 a->nonew = (flg ? -1 : 0); 887 break; 888 case MAT_NEW_NONZERO_ALLOCATION_ERR: 889 a->nonew = (flg ? -2 : 0); 890 break; 891 case MAT_UNUSED_NONZERO_LOCATION_ERR: 892 a->nounused = (flg ? -1 : 0); 893 break; 894 case MAT_FORCE_DIAGONAL_ENTRIES: 895 case MAT_IGNORE_OFF_PROC_ENTRIES: 896 case MAT_USE_HASH_TABLE: 897 case MAT_SORTED_FULL: 898 CHKERRQ(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 899 break; 900 case MAT_SPD: 901 case MAT_SYMMETRIC: 902 case MAT_STRUCTURALLY_SYMMETRIC: 903 case MAT_HERMITIAN: 904 case MAT_SYMMETRY_ETERNAL: 905 /* These options are handled directly by MatSetOption() */ 906 break; 907 default: 908 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 909 } 910 PetscFunctionReturn(0); 911 } 912 913 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 914 { 915 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 916 PetscInt i,j,n,shift; 917 PetscScalar *x,zero=0.0; 918 919 PetscFunctionBegin; 920 CHKERRQ(VecGetLocalSize(v,&n)); 921 PetscCheckFalse(n != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 922 923 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 924 PetscInt *diag=a->diag; 925 CHKERRQ(VecGetArray(v,&x)); 926 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 927 CHKERRQ(VecRestoreArray(v,&x)); 928 PetscFunctionReturn(0); 929 } 930 931 CHKERRQ(VecSet(v,zero)); 932 CHKERRQ(VecGetArray(v,&x)); 933 for (i=0; i<n; i++) { /* loop over rows */ 934 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 935 x[i] = 0; 936 for (j=0; j<a->rlen[i]; j++) { 937 if (a->colidx[shift+j*8] == i) { 938 x[i] = a->val[shift+j*8]; 939 break; 940 } 941 } 942 } 943 CHKERRQ(VecRestoreArray(v,&x)); 944 PetscFunctionReturn(0); 945 } 946 947 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 948 { 949 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 950 const PetscScalar *l,*r; 951 PetscInt i,j,m,n,row; 952 953 PetscFunctionBegin; 954 if (ll) { 955 /* The local size is used so that VecMPI can be passed to this routine 956 by MatDiagonalScale_MPISELL */ 957 CHKERRQ(VecGetLocalSize(ll,&m)); 958 PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 959 CHKERRQ(VecGetArrayRead(ll,&l)); 960 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 961 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 962 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 963 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 964 } 965 } else { 966 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 967 a->val[j] *= l[8*i+row]; 968 } 969 } 970 } 971 CHKERRQ(VecRestoreArrayRead(ll,&l)); 972 CHKERRQ(PetscLogFlops(a->nz)); 973 } 974 if (rr) { 975 CHKERRQ(VecGetLocalSize(rr,&n)); 976 PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 977 CHKERRQ(VecGetArrayRead(rr,&r)); 978 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 979 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 980 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 981 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 982 } 983 } else { 984 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 985 a->val[j] *= r[a->colidx[j]]; 986 } 987 } 988 } 989 CHKERRQ(VecRestoreArrayRead(rr,&r)); 990 CHKERRQ(PetscLogFlops(a->nz)); 991 } 992 CHKERRQ(MatSeqSELLInvalidateDiagonal(A)); 993 PetscFunctionReturn(0); 994 } 995 996 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 997 { 998 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 999 PetscInt *cp,i,k,low,high,t,row,col,l; 1000 PetscInt shift; 1001 MatScalar *vp; 1002 1003 PetscFunctionBegin; 1004 for (k=0; k<m; k++) { /* loop over requested rows */ 1005 row = im[k]; 1006 if (row<0) continue; 1007 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1008 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1009 cp = a->colidx+shift; /* pointer to the row */ 1010 vp = a->val+shift; /* pointer to the row */ 1011 for (l=0; l<n; l++) { /* loop over requested columns */ 1012 col = in[l]; 1013 if (col<0) continue; 1014 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1015 high = a->rlen[row]; low = 0; /* assume unsorted */ 1016 while (high-low > 5) { 1017 t = (low+high)/2; 1018 if (*(cp+t*8) > col) high = t; 1019 else low = t; 1020 } 1021 for (i=low; i<high; i++) { 1022 if (*(cp+8*i) > col) break; 1023 if (*(cp+8*i) == col) { 1024 *v++ = *(vp+8*i); 1025 goto finished; 1026 } 1027 } 1028 *v++ = 0.0; 1029 finished:; 1030 } 1031 } 1032 PetscFunctionReturn(0); 1033 } 1034 1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1036 { 1037 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1038 PetscInt i,j,m=A->rmap->n,shift; 1039 const char *name; 1040 PetscViewerFormat format; 1041 1042 PetscFunctionBegin; 1043 CHKERRQ(PetscViewerGetFormat(viewer,&format)); 1044 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1045 PetscInt nofinalvalue = 0; 1046 /* 1047 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1048 nofinalvalue = 1; 1049 } 1050 */ 1051 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1052 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1053 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1054 #if defined(PETSC_USE_COMPLEX) 1055 CHKERRQ(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1056 #else 1057 CHKERRQ(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1058 #endif 1059 CHKERRQ(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1060 1061 for (i=0; i<m; i++) { 1062 shift = a->sliidx[i>>3]+(i&0x07); 1063 for (j=0; j<a->rlen[i]; j++) { 1064 #if defined(PETSC_USE_COMPLEX) 1065 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1066 #else 1067 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1068 #endif 1069 } 1070 } 1071 /* 1072 if (nofinalvalue) { 1073 #if defined(PETSC_USE_COMPLEX) 1074 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1075 #else 1076 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1077 #endif 1078 } 1079 */ 1080 CHKERRQ(PetscObjectGetName((PetscObject)A,&name)); 1081 CHKERRQ(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1082 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1083 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1084 PetscFunctionReturn(0); 1085 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1086 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1087 for (i=0; i<m; i++) { 1088 CHKERRQ(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1089 shift = a->sliidx[i>>3]+(i&0x07); 1090 for (j=0; j<a->rlen[i]; j++) { 1091 #if defined(PETSC_USE_COMPLEX) 1092 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1093 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1094 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1095 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1096 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1097 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1098 } 1099 #else 1100 if (a->val[shift+8*j] != 0.0) CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1101 #endif 1102 } 1103 CHKERRQ(PetscViewerASCIIPrintf(viewer,"\n")); 1104 } 1105 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1106 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1107 PetscInt cnt=0,jcnt; 1108 PetscScalar value; 1109 #if defined(PETSC_USE_COMPLEX) 1110 PetscBool realonly=PETSC_TRUE; 1111 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1112 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1113 realonly = PETSC_FALSE; 1114 break; 1115 } 1116 } 1117 #endif 1118 1119 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1120 for (i=0; i<m; i++) { 1121 jcnt = 0; 1122 shift = a->sliidx[i>>3]+(i&0x07); 1123 for (j=0; j<A->cmap->n; j++) { 1124 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1125 value = a->val[cnt++]; 1126 jcnt++; 1127 } else { 1128 value = 0.0; 1129 } 1130 #if defined(PETSC_USE_COMPLEX) 1131 if (realonly) { 1132 CHKERRQ(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1133 } else { 1134 CHKERRQ(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1135 } 1136 #else 1137 CHKERRQ(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1138 #endif 1139 } 1140 CHKERRQ(PetscViewerASCIIPrintf(viewer,"\n")); 1141 } 1142 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1143 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1144 PetscInt fshift=1; 1145 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1146 #if defined(PETSC_USE_COMPLEX) 1147 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1148 #else 1149 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1150 #endif 1151 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1152 for (i=0; i<m; i++) { 1153 shift = a->sliidx[i>>3]+(i&0x07); 1154 for (j=0; j<a->rlen[i]; j++) { 1155 #if defined(PETSC_USE_COMPLEX) 1156 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1157 #else 1158 CHKERRQ(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1159 #endif 1160 } 1161 } 1162 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1163 } else if (format == PETSC_VIEWER_NATIVE) { 1164 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1165 PetscInt row; 1166 CHKERRQ(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1167 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1168 #if defined(PETSC_USE_COMPLEX) 1169 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1170 CHKERRQ(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1171 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1172 CHKERRQ(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1173 } else { 1174 CHKERRQ(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1175 } 1176 #else 1177 CHKERRQ(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1178 #endif 1179 } 1180 } 1181 } else { 1182 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1183 if (A->factortype) { 1184 for (i=0; i<m; i++) { 1185 shift = a->sliidx[i>>3]+(i&0x07); 1186 CHKERRQ(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1187 /* L part */ 1188 for (j=shift; j<a->diag[i]; j+=8) { 1189 #if defined(PETSC_USE_COMPLEX) 1190 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1191 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1192 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1193 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1194 } else { 1195 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1196 } 1197 #else 1198 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1199 #endif 1200 } 1201 /* diagonal */ 1202 j = a->diag[i]; 1203 #if defined(PETSC_USE_COMPLEX) 1204 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1205 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1206 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1207 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1208 } else { 1209 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1210 } 1211 #else 1212 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1213 #endif 1214 1215 /* U part */ 1216 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1217 #if defined(PETSC_USE_COMPLEX) 1218 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1219 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1220 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1221 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1222 } else { 1223 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1224 } 1225 #else 1226 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1227 #endif 1228 } 1229 CHKERRQ(PetscViewerASCIIPrintf(viewer,"\n")); 1230 } 1231 } else { 1232 for (i=0; i<m; i++) { 1233 shift = a->sliidx[i>>3]+(i&0x07); 1234 CHKERRQ(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1235 for (j=0; j<a->rlen[i]; j++) { 1236 #if defined(PETSC_USE_COMPLEX) 1237 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1238 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1239 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1240 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1241 } else { 1242 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1243 } 1244 #else 1245 CHKERRQ(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1246 #endif 1247 } 1248 CHKERRQ(PetscViewerASCIIPrintf(viewer,"\n")); 1249 } 1250 } 1251 CHKERRQ(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1252 } 1253 CHKERRQ(PetscViewerFlush(viewer)); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 #include <petscdraw.h> 1258 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1259 { 1260 Mat A=(Mat)Aa; 1261 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1262 PetscInt i,j,m=A->rmap->n,shift; 1263 int color; 1264 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1265 PetscViewer viewer; 1266 PetscViewerFormat format; 1267 PetscErrorCode ierr; 1268 1269 PetscFunctionBegin; 1270 CHKERRQ(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1271 CHKERRQ(PetscViewerGetFormat(viewer,&format)); 1272 CHKERRQ(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1273 1274 /* loop over matrix elements drawing boxes */ 1275 1276 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1277 ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr); 1278 /* Blue for negative, Cyan for zero and Red for positive */ 1279 color = PETSC_DRAW_BLUE; 1280 for (i=0; i<m; i++) { 1281 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1282 y_l = m - i - 1.0; y_r = y_l + 1.0; 1283 for (j=0; j<a->rlen[i]; j++) { 1284 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1285 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1286 CHKERRQ(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1287 } 1288 } 1289 color = PETSC_DRAW_CYAN; 1290 for (i=0; i<m; i++) { 1291 shift = a->sliidx[i>>3]+(i&0x07); 1292 y_l = m - i - 1.0; y_r = y_l + 1.0; 1293 for (j=0; j<a->rlen[i]; j++) { 1294 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1295 if (a->val[shift+8*j] != 0.) continue; 1296 CHKERRQ(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1297 } 1298 } 1299 color = PETSC_DRAW_RED; 1300 for (i=0; i<m; i++) { 1301 shift = a->sliidx[i>>3]+(i&0x07); 1302 y_l = m - i - 1.0; y_r = y_l + 1.0; 1303 for (j=0; j<a->rlen[i]; j++) { 1304 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1305 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1306 CHKERRQ(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1307 } 1308 } 1309 ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr); 1310 } else { 1311 /* use contour shading to indicate magnitude of values */ 1312 /* first determine max of all nonzero values */ 1313 PetscReal minv=0.0,maxv=0.0; 1314 PetscInt count=0; 1315 PetscDraw popup; 1316 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1317 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1318 } 1319 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1320 CHKERRQ(PetscDrawGetPopup(draw,&popup)); 1321 CHKERRQ(PetscDrawScalePopup(popup,minv,maxv)); 1322 1323 ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr); 1324 for (i=0; i<m; i++) { 1325 shift = a->sliidx[i>>3]+(i&0x07); 1326 y_l = m - i - 1.0; 1327 y_r = y_l + 1.0; 1328 for (j=0; j<a->rlen[i]; j++) { 1329 x_l = a->colidx[shift+j*8]; 1330 x_r = x_l + 1.0; 1331 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1332 CHKERRQ(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1333 count++; 1334 } 1335 } 1336 ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr); 1337 } 1338 PetscFunctionReturn(0); 1339 } 1340 1341 #include <petscdraw.h> 1342 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1343 { 1344 PetscDraw draw; 1345 PetscReal xr,yr,xl,yl,h,w; 1346 PetscBool isnull; 1347 1348 PetscFunctionBegin; 1349 CHKERRQ(PetscViewerDrawGetDraw(viewer,0,&draw)); 1350 CHKERRQ(PetscDrawIsNull(draw,&isnull)); 1351 if (isnull) PetscFunctionReturn(0); 1352 1353 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1354 xr += w; yr += h; xl = -w; yl = -h; 1355 CHKERRQ(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1356 CHKERRQ(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1357 CHKERRQ(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1358 CHKERRQ(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1359 CHKERRQ(PetscDrawSave(draw)); 1360 PetscFunctionReturn(0); 1361 } 1362 1363 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1364 { 1365 PetscBool iascii,isbinary,isdraw; 1366 1367 PetscFunctionBegin; 1368 CHKERRQ(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1369 CHKERRQ(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1370 CHKERRQ(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1371 if (iascii) { 1372 CHKERRQ(MatView_SeqSELL_ASCII(A,viewer)); 1373 } else if (isbinary) { 1374 /* CHKERRQ(MatView_SeqSELL_Binary(A,viewer)); */ 1375 } else if (isdraw) { 1376 CHKERRQ(MatView_SeqSELL_Draw(A,viewer)); 1377 } 1378 PetscFunctionReturn(0); 1379 } 1380 1381 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1382 { 1383 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1384 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1385 MatScalar *vp; 1386 1387 PetscFunctionBegin; 1388 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1389 /* To do: compress out the unused elements */ 1390 CHKERRQ(MatMarkDiagonal_SeqSELL(A)); 1391 CHKERRQ(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1392 CHKERRQ(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1393 CHKERRQ(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1394 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1395 for (i=0; i<a->totalslices; ++i) { 1396 shift = a->sliidx[i]; /* starting index of the slice */ 1397 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1398 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1399 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1400 row = 8*i + row_in_slice; 1401 nrow = a->rlen[row]; /* number of nonzeros in row */ 1402 /* 1403 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1404 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1405 */ 1406 lastcol = 0; 1407 if (nrow>0) { /* nonempty row */ 1408 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1409 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1410 for (j=1;j<8;j++) { 1411 if (a->rlen[8*i+j]) { 1412 lastcol = cp[j]; 1413 break; 1414 } 1415 } 1416 } else { 1417 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1418 } 1419 1420 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1421 cp[8*k+row_in_slice] = lastcol; 1422 vp[8*k+row_in_slice] = (MatScalar)0; 1423 } 1424 } 1425 } 1426 1427 A->info.mallocs += a->reallocs; 1428 a->reallocs = 0; 1429 1430 CHKERRQ(MatSeqSELLInvalidateDiagonal(A)); 1431 PetscFunctionReturn(0); 1432 } 1433 1434 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1435 { 1436 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1437 1438 PetscFunctionBegin; 1439 info->block_size = 1.0; 1440 info->nz_allocated = a->maxallocmat; 1441 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1442 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1443 info->assemblies = A->num_ass; 1444 info->mallocs = A->info.mallocs; 1445 info->memory = ((PetscObject)A)->mem; 1446 if (A->factortype) { 1447 info->fill_ratio_given = A->info.fill_ratio_given; 1448 info->fill_ratio_needed = A->info.fill_ratio_needed; 1449 info->factor_mallocs = A->info.factor_mallocs; 1450 } else { 1451 info->fill_ratio_given = 0; 1452 info->fill_ratio_needed = 0; 1453 info->factor_mallocs = 0; 1454 } 1455 PetscFunctionReturn(0); 1456 } 1457 1458 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1459 { 1460 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1461 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1462 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1463 MatScalar *vp,value; 1464 1465 PetscFunctionBegin; 1466 for (k=0; k<m; k++) { /* loop over added rows */ 1467 row = im[k]; 1468 if (row < 0) continue; 1469 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1470 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1471 cp = a->colidx+shift; /* pointer to the row */ 1472 vp = a->val+shift; /* pointer to the row */ 1473 nrow = a->rlen[row]; 1474 low = 0; 1475 high = nrow; 1476 1477 for (l=0; l<n; l++) { /* loop over added columns */ 1478 col = in[l]; 1479 if (col<0) continue; 1480 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1481 if (a->roworiented) { 1482 value = v[l+k*n]; 1483 } else { 1484 value = v[k+l*m]; 1485 } 1486 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1487 1488 /* search in this row for the specified column, i indicates the column to be set */ 1489 if (col <= lastcol) low = 0; 1490 else high = nrow; 1491 lastcol = col; 1492 while (high-low > 5) { 1493 t = (low+high)/2; 1494 if (*(cp+t*8) > col) high = t; 1495 else low = t; 1496 } 1497 for (i=low; i<high; i++) { 1498 if (*(cp+i*8) > col) break; 1499 if (*(cp+i*8) == col) { 1500 if (is == ADD_VALUES) *(vp+i*8) += value; 1501 else *(vp+i*8) = value; 1502 low = i + 1; 1503 goto noinsert; 1504 } 1505 } 1506 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1507 if (nonew == 1) goto noinsert; 1508 PetscCheckFalse(nonew == -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1509 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1510 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1511 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1512 for (ii=nrow-1; ii>=i; ii--) { 1513 *(cp+(ii+1)*8) = *(cp+ii*8); 1514 *(vp+(ii+1)*8) = *(vp+ii*8); 1515 } 1516 a->rlen[row]++; 1517 *(cp+i*8) = col; 1518 *(vp+i*8) = value; 1519 a->nz++; 1520 A->nonzerostate++; 1521 low = i+1; high++; nrow++; 1522 noinsert:; 1523 } 1524 a->rlen[row] = nrow; 1525 } 1526 PetscFunctionReturn(0); 1527 } 1528 1529 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1530 { 1531 PetscFunctionBegin; 1532 /* If the two matrices have the same copy implementation, use fast copy. */ 1533 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1534 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1535 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1536 1537 PetscCheckFalse(a->sliidx[a->totalslices] != b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1538 CHKERRQ(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1539 } else { 1540 CHKERRQ(MatCopy_Basic(A,B,str)); 1541 } 1542 PetscFunctionReturn(0); 1543 } 1544 1545 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1546 { 1547 PetscFunctionBegin; 1548 CHKERRQ(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1553 { 1554 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1555 1556 PetscFunctionBegin; 1557 *array = a->val; 1558 PetscFunctionReturn(0); 1559 } 1560 1561 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1562 { 1563 PetscFunctionBegin; 1564 PetscFunctionReturn(0); 1565 } 1566 1567 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1568 { 1569 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1570 PetscInt i; 1571 MatScalar *aval=a->val; 1572 1573 PetscFunctionBegin; 1574 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1575 PetscFunctionReturn(0); 1576 } 1577 1578 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1579 { 1580 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1581 PetscInt i; 1582 MatScalar *aval=a->val; 1583 1584 PetscFunctionBegin; 1585 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1586 CHKERRQ(MatSeqSELLInvalidateDiagonal(A)); 1587 PetscFunctionReturn(0); 1588 } 1589 1590 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1591 { 1592 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1593 MatScalar *aval=a->val; 1594 PetscScalar oalpha=alpha; 1595 PetscBLASInt one=1,size; 1596 1597 PetscFunctionBegin; 1598 CHKERRQ(PetscBLASIntCast(a->sliidx[a->totalslices],&size)); 1599 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1600 CHKERRQ(PetscLogFlops(a->nz)); 1601 CHKERRQ(MatSeqSELLInvalidateDiagonal(inA)); 1602 PetscFunctionReturn(0); 1603 } 1604 1605 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1606 { 1607 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1608 1609 PetscFunctionBegin; 1610 if (!Y->preallocated || !y->nz) { 1611 CHKERRQ(MatSeqSELLSetPreallocation(Y,1,NULL)); 1612 } 1613 CHKERRQ(MatShift_Basic(Y,a)); 1614 PetscFunctionReturn(0); 1615 } 1616 1617 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1618 { 1619 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1620 PetscScalar *x,sum,*t; 1621 const MatScalar *idiag=NULL,*mdiag; 1622 const PetscScalar *b,*xb; 1623 PetscInt n,m=A->rmap->n,i,j,shift; 1624 const PetscInt *diag; 1625 1626 PetscFunctionBegin; 1627 its = its*lits; 1628 1629 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1630 if (!a->idiagvalid) CHKERRQ(MatInvertDiagonal_SeqSELL(A,omega,fshift)); 1631 a->fshift = fshift; 1632 a->omega = omega; 1633 1634 diag = a->diag; 1635 t = a->ssor_work; 1636 idiag = a->idiag; 1637 mdiag = a->mdiag; 1638 1639 CHKERRQ(VecGetArray(xx,&x)); 1640 CHKERRQ(VecGetArrayRead(bb,&b)); 1641 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1642 PetscCheckFalse(flag == SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1643 PetscCheckFalse(flag == SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1644 PetscCheckFalse(flag & SOR_EISENSTAT,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1645 1646 if (flag & SOR_ZERO_INITIAL_GUESS) { 1647 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1648 for (i=0; i<m; i++) { 1649 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1650 sum = b[i]; 1651 n = (diag[i]-shift)/8; 1652 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1653 t[i] = sum; 1654 x[i] = sum*idiag[i]; 1655 } 1656 xb = t; 1657 CHKERRQ(PetscLogFlops(a->nz)); 1658 } else xb = b; 1659 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1660 for (i=m-1; i>=0; i--) { 1661 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1662 sum = xb[i]; 1663 n = a->rlen[i]-(diag[i]-shift)/8-1; 1664 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1665 if (xb == b) { 1666 x[i] = sum*idiag[i]; 1667 } else { 1668 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1669 } 1670 } 1671 CHKERRQ(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1672 } 1673 its--; 1674 } 1675 while (its--) { 1676 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1677 for (i=0; i<m; i++) { 1678 /* lower */ 1679 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1680 sum = b[i]; 1681 n = (diag[i]-shift)/8; 1682 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1683 t[i] = sum; /* save application of the lower-triangular part */ 1684 /* upper */ 1685 n = a->rlen[i]-(diag[i]-shift)/8-1; 1686 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1687 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1688 } 1689 xb = t; 1690 CHKERRQ(PetscLogFlops(2.0*a->nz)); 1691 } else xb = b; 1692 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1693 for (i=m-1; i>=0; i--) { 1694 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1695 sum = xb[i]; 1696 if (xb == b) { 1697 /* whole matrix (no checkpointing available) */ 1698 n = a->rlen[i]; 1699 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1700 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1701 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1702 n = a->rlen[i]-(diag[i]-shift)/8-1; 1703 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1704 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1705 } 1706 } 1707 if (xb == b) { 1708 CHKERRQ(PetscLogFlops(2.0*a->nz)); 1709 } else { 1710 CHKERRQ(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1711 } 1712 } 1713 } 1714 CHKERRQ(VecRestoreArray(xx,&x)); 1715 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1716 PetscFunctionReturn(0); 1717 } 1718 1719 /* -------------------------------------------------------------------*/ 1720 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1721 MatGetRow_SeqSELL, 1722 MatRestoreRow_SeqSELL, 1723 MatMult_SeqSELL, 1724 /* 4*/ MatMultAdd_SeqSELL, 1725 MatMultTranspose_SeqSELL, 1726 MatMultTransposeAdd_SeqSELL, 1727 NULL, 1728 NULL, 1729 NULL, 1730 /* 10*/ NULL, 1731 NULL, 1732 NULL, 1733 MatSOR_SeqSELL, 1734 NULL, 1735 /* 15*/ MatGetInfo_SeqSELL, 1736 MatEqual_SeqSELL, 1737 MatGetDiagonal_SeqSELL, 1738 MatDiagonalScale_SeqSELL, 1739 NULL, 1740 /* 20*/ NULL, 1741 MatAssemblyEnd_SeqSELL, 1742 MatSetOption_SeqSELL, 1743 MatZeroEntries_SeqSELL, 1744 /* 24*/ NULL, 1745 NULL, 1746 NULL, 1747 NULL, 1748 NULL, 1749 /* 29*/ MatSetUp_SeqSELL, 1750 NULL, 1751 NULL, 1752 NULL, 1753 NULL, 1754 /* 34*/ MatDuplicate_SeqSELL, 1755 NULL, 1756 NULL, 1757 NULL, 1758 NULL, 1759 /* 39*/ NULL, 1760 NULL, 1761 NULL, 1762 MatGetValues_SeqSELL, 1763 MatCopy_SeqSELL, 1764 /* 44*/ NULL, 1765 MatScale_SeqSELL, 1766 MatShift_SeqSELL, 1767 NULL, 1768 NULL, 1769 /* 49*/ NULL, 1770 NULL, 1771 NULL, 1772 NULL, 1773 NULL, 1774 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1775 NULL, 1776 NULL, 1777 NULL, 1778 NULL, 1779 /* 59*/ NULL, 1780 MatDestroy_SeqSELL, 1781 MatView_SeqSELL, 1782 NULL, 1783 NULL, 1784 /* 64*/ NULL, 1785 NULL, 1786 NULL, 1787 NULL, 1788 NULL, 1789 /* 69*/ NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 NULL, 1794 /* 74*/ NULL, 1795 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1796 NULL, 1797 NULL, 1798 NULL, 1799 /* 79*/ NULL, 1800 NULL, 1801 NULL, 1802 NULL, 1803 NULL, 1804 /* 84*/ NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 NULL, 1809 /* 89*/ NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 /* 94*/ NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 /* 99*/ NULL, 1820 NULL, 1821 NULL, 1822 MatConjugate_SeqSELL, 1823 NULL, 1824 /*104*/ NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 /*109*/ NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 MatMissingDiagonal_SeqSELL, 1834 /*114*/ NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 NULL, 1839 /*119*/ NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 /*124*/ NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 /*129*/ NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 NULL, 1854 /*134*/ NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 /*139*/ NULL, 1860 NULL, 1861 NULL, 1862 MatFDColoringSetUp_SeqXAIJ, 1863 NULL, 1864 /*144*/ NULL, 1865 NULL, 1866 NULL, 1867 NULL 1868 }; 1869 1870 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1871 { 1872 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1873 1874 PetscFunctionBegin; 1875 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1876 1877 /* allocate space for values if not already there */ 1878 if (!a->saved_values) { 1879 CHKERRQ(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values)); 1880 CHKERRQ(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar))); 1881 } 1882 1883 /* copy values over */ 1884 CHKERRQ(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices])); 1885 PetscFunctionReturn(0); 1886 } 1887 1888 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1889 { 1890 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1891 1892 PetscFunctionBegin; 1893 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1894 PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1895 CHKERRQ(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices])); 1896 PetscFunctionReturn(0); 1897 } 1898 1899 /*@C 1900 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1901 1902 Not Collective 1903 1904 Input Parameters: 1905 . mat - a MATSEQSELL matrix 1906 . array - pointer to the data 1907 1908 Level: intermediate 1909 1910 .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90() 1911 @*/ 1912 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1913 { 1914 PetscFunctionBegin; 1915 CHKERRQ(PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array))); 1916 PetscFunctionReturn(0); 1917 } 1918 1919 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1920 { 1921 Mat_SeqSELL *b; 1922 PetscMPIInt size; 1923 1924 PetscFunctionBegin; 1925 CHKERRQ(PetscCitationsRegister(citation,&cited)); 1926 CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 1927 PetscCheckFalse(size > 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1928 1929 CHKERRQ(PetscNewLog(B,&b)); 1930 1931 B->data = (void*)b; 1932 1933 CHKERRQ(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps))); 1934 1935 b->row = NULL; 1936 b->col = NULL; 1937 b->icol = NULL; 1938 b->reallocs = 0; 1939 b->ignorezeroentries = PETSC_FALSE; 1940 b->roworiented = PETSC_TRUE; 1941 b->nonew = 0; 1942 b->diag = NULL; 1943 b->solve_work = NULL; 1944 B->spptr = NULL; 1945 b->saved_values = NULL; 1946 b->idiag = NULL; 1947 b->mdiag = NULL; 1948 b->ssor_work = NULL; 1949 b->omega = 1.0; 1950 b->fshift = 0.0; 1951 b->idiagvalid = PETSC_FALSE; 1952 b->keepnonzeropattern = PETSC_FALSE; 1953 1954 CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL)); 1955 CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL)); 1956 CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL)); 1957 CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL)); 1958 CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL)); 1959 CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL)); 1960 CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ)); 1961 PetscFunctionReturn(0); 1962 } 1963 1964 /* 1965 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1966 */ 1967 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1968 { 1969 Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data; 1970 PetscInt i,m=A->rmap->n; 1971 PetscInt totalslices=a->totalslices; 1972 1973 PetscFunctionBegin; 1974 C->factortype = A->factortype; 1975 c->row = NULL; 1976 c->col = NULL; 1977 c->icol = NULL; 1978 c->reallocs = 0; 1979 C->assembled = PETSC_TRUE; 1980 1981 CHKERRQ(PetscLayoutReference(A->rmap,&C->rmap)); 1982 CHKERRQ(PetscLayoutReference(A->cmap,&C->cmap)); 1983 1984 CHKERRQ(PetscMalloc1(8*totalslices,&c->rlen)); 1985 CHKERRQ(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 1986 CHKERRQ(PetscMalloc1(totalslices+1,&c->sliidx)); 1987 CHKERRQ(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt))); 1988 1989 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 1990 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 1991 1992 /* allocate the matrix space */ 1993 if (mallocmatspace) { 1994 CHKERRQ(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx)); 1995 CHKERRQ(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)))); 1996 1997 c->singlemalloc = PETSC_TRUE; 1998 1999 if (m > 0) { 2000 CHKERRQ(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat)); 2001 if (cpvalues == MAT_COPY_VALUES) { 2002 CHKERRQ(PetscArraycpy(c->val,a->val,a->maxallocmat)); 2003 } else { 2004 CHKERRQ(PetscArrayzero(c->val,a->maxallocmat)); 2005 } 2006 } 2007 } 2008 2009 c->ignorezeroentries = a->ignorezeroentries; 2010 c->roworiented = a->roworiented; 2011 c->nonew = a->nonew; 2012 if (a->diag) { 2013 CHKERRQ(PetscMalloc1(m,&c->diag)); 2014 CHKERRQ(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 2015 for (i=0; i<m; i++) { 2016 c->diag[i] = a->diag[i]; 2017 } 2018 } else c->diag = NULL; 2019 2020 c->solve_work = NULL; 2021 c->saved_values = NULL; 2022 c->idiag = NULL; 2023 c->ssor_work = NULL; 2024 c->keepnonzeropattern = a->keepnonzeropattern; 2025 c->free_val = PETSC_TRUE; 2026 c->free_colidx = PETSC_TRUE; 2027 2028 c->maxallocmat = a->maxallocmat; 2029 c->maxallocrow = a->maxallocrow; 2030 c->rlenmax = a->rlenmax; 2031 c->nz = a->nz; 2032 C->preallocated = PETSC_TRUE; 2033 2034 c->nonzerorowcnt = a->nonzerorowcnt; 2035 C->nonzerostate = A->nonzerostate; 2036 2037 CHKERRQ(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist)); 2038 PetscFunctionReturn(0); 2039 } 2040 2041 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2042 { 2043 PetscFunctionBegin; 2044 CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B)); 2045 CHKERRQ(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n)); 2046 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2047 CHKERRQ(MatSetBlockSizesFromMats(*B,A,A)); 2048 } 2049 CHKERRQ(MatSetType(*B,((PetscObject)A)->type_name)); 2050 CHKERRQ(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE)); 2051 PetscFunctionReturn(0); 2052 } 2053 2054 /*MC 2055 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2056 based on the sliced Ellpack format 2057 2058 Options Database Keys: 2059 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions() 2060 2061 Level: beginner 2062 2063 .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ 2064 M*/ 2065 2066 /*MC 2067 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2068 2069 This matrix type is identical to MATSEQSELL when constructed with a single process communicator, 2070 and MATMPISELL otherwise. As a result, for single process communicators, 2071 MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported 2072 for communicators controlling multiple processes. It is recommended that you call both of 2073 the above preallocation routines for simplicity. 2074 2075 Options Database Keys: 2076 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2077 2078 Level: beginner 2079 2080 Notes: 2081 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2082 2083 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2084 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2085 2086 Developer Notes: 2087 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2088 2089 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2090 .vb 2091 (2 0 3 4) 2092 Consider the matrix A = (5 0 6 0) 2093 (0 0 7 8) 2094 (0 0 9 9) 2095 2096 symbolically the Ellpack format can be written as 2097 2098 (2 3 4 |) (0 2 3 |) 2099 v = (5 6 0 |) colidx = (0 2 2 |) 2100 -------- --------- 2101 (7 8 |) (2 3 |) 2102 (9 9 |) (2 3 |) 2103 2104 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2105 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2106 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2107 2108 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2109 2110 .ve 2111 2112 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2113 2114 References: 2115 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2116 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2117 2118 .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ 2119 M*/ 2120 2121 /*@C 2122 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2123 2124 Collective on comm 2125 2126 Input Parameters: 2127 + comm - MPI communicator, set to PETSC_COMM_SELF 2128 . m - number of rows 2129 . n - number of columns 2130 . rlenmax - maximum number of nonzeros in a row 2131 - rlen - array containing the number of nonzeros in the various rows 2132 (possibly different for each row) or NULL 2133 2134 Output Parameter: 2135 . A - the matrix 2136 2137 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2138 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2139 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2140 2141 Notes: 2142 If nnz is given then nz is ignored 2143 2144 Specify the preallocated storage with either rlenmax or rlen (not both). 2145 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2146 allocation. For large problems you MUST preallocate memory or you 2147 will get TERRIBLE performance, see the users' manual chapter on matrices. 2148 2149 Level: intermediate 2150 2151 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL 2152 2153 @*/ 2154 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2155 { 2156 PetscFunctionBegin; 2157 CHKERRQ(MatCreate(comm,A)); 2158 CHKERRQ(MatSetSizes(*A,m,n,m,n)); 2159 CHKERRQ(MatSetType(*A,MATSEQSELL)); 2160 CHKERRQ(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen)); 2161 PetscFunctionReturn(0); 2162 } 2163 2164 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2165 { 2166 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2167 PetscInt totalslices=a->totalslices; 2168 2169 PetscFunctionBegin; 2170 /* If the matrix dimensions are not equal,or no of nonzeros */ 2171 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2172 *flg = PETSC_FALSE; 2173 PetscFunctionReturn(0); 2174 } 2175 /* if the a->colidx are the same */ 2176 CHKERRQ(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg)); 2177 if (!*flg) PetscFunctionReturn(0); 2178 /* if a->val are the same */ 2179 CHKERRQ(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg)); 2180 PetscFunctionReturn(0); 2181 } 2182 2183 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2184 { 2185 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2186 2187 PetscFunctionBegin; 2188 a->idiagvalid = PETSC_FALSE; 2189 PetscFunctionReturn(0); 2190 } 2191 2192 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2193 { 2194 #if defined(PETSC_USE_COMPLEX) 2195 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2196 PetscInt i; 2197 PetscScalar *val = a->val; 2198 2199 PetscFunctionBegin; 2200 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2201 val[i] = PetscConj(val[i]); 2202 } 2203 #else 2204 PetscFunctionBegin; 2205 #endif 2206 PetscFunctionReturn(0); 2207 } 2208