1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 Mat B = *newmat; 65 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 66 67 PetscFunctionBegin; 68 if (reuse == MAT_INITIAL_MATRIX) { 69 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B)); 70 aijperm=(Mat_SeqAIJPERM*)B->spptr; 71 } 72 73 /* Reset the original function pointers. */ 74 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 75 B->ops->destroy = MatDestroy_SeqAIJ; 76 B->ops->duplicate = MatDuplicate_SeqAIJ; 77 B->ops->mult = MatMult_SeqAIJ; 78 B->ops->multadd = MatMultAdd_SeqAIJ; 79 80 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL)); 81 82 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 83 PetscCall(PetscFree(aijperm->xgroup)); 84 PetscCall(PetscFree(aijperm->nzgroup)); 85 PetscCall(PetscFree(aijperm->iperm)); 86 PetscCall(PetscFree(B->spptr)); 87 88 /* Change the type of B to MATSEQAIJ. */ 89 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ)); 90 91 *newmat = B; 92 PetscFunctionReturn(0); 93 } 94 95 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 96 { 97 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 98 99 PetscFunctionBegin; 100 if (aijperm) { 101 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 102 PetscCall(PetscFree(aijperm->xgroup)); 103 PetscCall(PetscFree(aijperm->nzgroup)); 104 PetscCall(PetscFree(aijperm->iperm)); 105 PetscCall(PetscFree(A->spptr)); 106 } 107 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 108 * to destroy everything that remains. */ 109 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ)); 110 /* Note that I don't call MatSetType(). I believe this is because that 111 * is only to be called when *building* a matrix. I could be wrong, but 112 * that is how things work for the SuperLU matrix class. */ 113 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijperm_seqaij_C",NULL)); 114 PetscCall(MatDestroy_SeqAIJ(A)); 115 PetscFunctionReturn(0); 116 } 117 118 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 119 { 120 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 121 Mat_SeqAIJPERM *aijperm_dest; 122 PetscBool perm; 123 124 PetscFunctionBegin; 125 PetscCall(MatDuplicate_SeqAIJ(A,op,M)); 126 PetscCall(PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm)); 127 if (perm) { 128 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 129 PetscCall(PetscFree(aijperm_dest->xgroup)); 130 PetscCall(PetscFree(aijperm_dest->nzgroup)); 131 PetscCall(PetscFree(aijperm_dest->iperm)); 132 } else { 133 PetscCall(PetscNewLog(*M,&aijperm_dest)); 134 (*M)->spptr = (void*) aijperm_dest; 135 PetscCall(PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM)); 136 PetscCall(PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ)); 137 } 138 PetscCall(PetscArraycpy(aijperm_dest,aijperm,1)); 139 /* Allocate space for, and copy the grouping and permutation info. 140 * I note that when the groups are initially determined in 141 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 142 * necessary. But at this point, we know how large they need to be, and 143 * allocate only the necessary amount of memory. So the duplicated matrix 144 * may actually use slightly less storage than the original! */ 145 PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm)); 146 PetscCall(PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup)); 147 PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup)); 148 PetscCall(PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n)); 149 PetscCall(PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1)); 150 PetscCall(PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup)); 151 PetscFunctionReturn(0); 152 } 153 154 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 157 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 158 PetscInt m; /* Number of rows in the matrix. */ 159 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 160 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 161 PetscInt *rows_in_bucket; 162 /* To construct the permutation, we sort each row into one of maxnz 163 * buckets based on how many nonzeros are in the row. */ 164 PetscInt nz; 165 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 166 PetscInt *ipnz; 167 /* When constructing the iperm permutation vector, 168 * ipnz[nz] is used to point to the next place in the permutation vector 169 * that a row with nz nonzero elements should be placed.*/ 170 PetscInt i, ngroup, istart, ipos; 171 172 PetscFunctionBegin; 173 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 174 aijperm->nonzerostate = A->nonzerostate; 175 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 176 PetscCall(PetscFree(aijperm->xgroup)); 177 PetscCall(PetscFree(aijperm->nzgroup)); 178 PetscCall(PetscFree(aijperm->iperm)); 179 180 m = A->rmap->n; 181 ia = a->i; 182 183 /* Allocate the arrays that will hold the permutation vector. */ 184 PetscCall(PetscMalloc1(m, &aijperm->iperm)); 185 186 /* Allocate some temporary work arrays that will be used in 187 * calculating the permutation vector and groupings. */ 188 PetscCall(PetscMalloc1(m, &nz_in_row)); 189 190 /* Now actually figure out the permutation and grouping. */ 191 192 /* First pass: Determine number of nonzeros in each row, maximum 193 * number of nonzeros in any row, and how many rows fall into each 194 * "bucket" of rows with same number of nonzeros. */ 195 maxnz = 0; 196 for (i=0; i<m; i++) { 197 nz_in_row[i] = ia[i+1]-ia[i]; 198 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 199 } 200 PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket)); 201 PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz)); 202 203 for (i=0; i<=maxnz; i++) { 204 rows_in_bucket[i] = 0; 205 } 206 for (i=0; i<m; i++) { 207 nz = nz_in_row[i]; 208 rows_in_bucket[nz]++; 209 } 210 211 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 212 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 213 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 214 * then xgroup[] must consist of (maxnz + 2) elements, since the last 215 * element of xgroup will tell us where the (maxnz + 1)th group ends. 216 * We allocate space for the maximum number of groups; 217 * that is potentially a little wasteful, but not too much so. 218 * Perhaps I should fix it later. */ 219 PetscCall(PetscMalloc1(maxnz+2, &aijperm->xgroup)); 220 PetscCall(PetscMalloc1(maxnz+1, &aijperm->nzgroup)); 221 222 /* Second pass. Look at what is in the buckets and create the groupings. 223 * Note that it is OK to have a group of rows with no non-zero values. */ 224 ngroup = 0; 225 istart = 0; 226 for (i=0; i<=maxnz; i++) { 227 if (rows_in_bucket[i] > 0) { 228 aijperm->nzgroup[ngroup] = i; 229 aijperm->xgroup[ngroup] = istart; 230 ngroup++; 231 istart += rows_in_bucket[i]; 232 } 233 } 234 235 aijperm->xgroup[ngroup] = istart; 236 aijperm->ngroup = ngroup; 237 238 /* Now fill in the permutation vector iperm. */ 239 ipnz[0] = 0; 240 for (i=0; i<maxnz; i++) { 241 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 242 } 243 244 for (i=0; i<m; i++) { 245 nz = nz_in_row[i]; 246 ipos = ipnz[nz]; 247 aijperm->iperm[ipos] = i; 248 ipnz[nz]++; 249 } 250 251 /* Clean up temporary work arrays. */ 252 PetscCall(PetscFree(rows_in_bucket)); 253 PetscCall(PetscFree(ipnz)); 254 PetscCall(PetscFree(nz_in_row)); 255 PetscFunctionReturn(0); 256 } 257 258 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 259 { 260 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 261 262 PetscFunctionBegin; 263 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 264 265 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 266 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 267 * I'm not sure if this is the best way to do this, but it avoids 268 * a lot of code duplication. 269 * I also note that currently MATSEQAIJPERM doesn't know anything about 270 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 271 * are many zero rows. If the SeqAIJ assembly end routine decides to use 272 * this, this may break things. (Don't know... haven't looked at it.) */ 273 a->inode.use = PETSC_FALSE; 274 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 275 276 /* Now calculate the permutation and grouping information. */ 277 PetscCall(MatSeqAIJPERM_create_perm(A)); 278 PetscFunctionReturn(0); 279 } 280 281 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 282 { 283 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 284 const PetscScalar *x; 285 PetscScalar *y; 286 const MatScalar *aa; 287 const PetscInt *aj,*ai; 288 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 289 PetscInt i,j; 290 #endif 291 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 292 __m512d vec_x,vec_y,vec_vals; 293 __m256i vec_idx,vec_ipos,vec_j; 294 __mmask8 mask; 295 #endif 296 297 /* Variables that don't appear in MatMult_SeqAIJ. */ 298 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 299 PetscInt *iperm; /* Points to the permutation vector. */ 300 PetscInt *xgroup; 301 /* Denotes where groups of rows with same number of nonzeros 302 * begin and end in iperm. */ 303 PetscInt *nzgroup; 304 PetscInt ngroup; 305 PetscInt igroup; 306 PetscInt jstart,jend; 307 /* jstart is used in loops to denote the position in iperm where a 308 * group starts; jend denotes the position where it ends. 309 * (jend + 1 is where the next group starts.) */ 310 PetscInt iold,nz; 311 PetscInt istart,iend,isize; 312 PetscInt ipos; 313 PetscScalar yp[NDIM]; 314 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 315 316 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 317 #pragma disjoint(*x,*y,*aa) 318 #endif 319 320 PetscFunctionBegin; 321 PetscCall(VecGetArrayRead(xx,&x)); 322 PetscCall(VecGetArray(yy,&y)); 323 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 324 aa = a->a; /* Nonzero elements stored row-by-row. */ 325 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 326 327 /* Get the info we need about the permutations and groupings. */ 328 iperm = aijperm->iperm; 329 ngroup = aijperm->ngroup; 330 xgroup = aijperm->xgroup; 331 nzgroup = aijperm->nzgroup; 332 333 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 334 fortranmultaijperm_(&m,x,ii,aj,aa,y); 335 #else 336 337 for (igroup=0; igroup<ngroup; igroup++) { 338 jstart = xgroup[igroup]; 339 jend = xgroup[igroup+1] - 1; 340 nz = nzgroup[igroup]; 341 342 /* Handle the special cases where the number of nonzeros per row 343 * in the group is either 0 or 1. */ 344 if (nz == 0) { 345 for (i=jstart; i<=jend; i++) { 346 y[iperm[i]] = 0.0; 347 } 348 } else if (nz == 1) { 349 for (i=jstart; i<=jend; i++) { 350 iold = iperm[i]; 351 ipos = ai[iold]; 352 y[iold] = aa[ipos] * x[aj[ipos]]; 353 } 354 } else { 355 356 /* We work our way through the current group in chunks of NDIM rows 357 * at a time. */ 358 359 for (istart=jstart; istart<=jend; istart+=NDIM) { 360 /* Figure out where the chunk of 'isize' rows ends in iperm. 361 * 'isize may of course be less than NDIM for the last chunk. */ 362 iend = istart + (NDIM - 1); 363 364 if (iend > jend) iend = jend; 365 366 isize = iend - istart + 1; 367 368 /* Initialize the yp[] array that will be used to hold part of 369 * the permuted results vector, and figure out where in aa each 370 * row of the chunk will begin. */ 371 for (i=0; i<isize; i++) { 372 iold = iperm[istart + i]; 373 /* iold is a row number from the matrix A *before* reordering. */ 374 ip[i] = ai[iold]; 375 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 376 yp[i] = (PetscScalar) 0.0; 377 } 378 379 /* If the number of zeros per row exceeds the number of rows in 380 * the chunk, we should vectorize along nz, that is, perform the 381 * mat-vec one row at a time as in the usual CSR case. */ 382 if (nz > isize) { 383 #if defined(PETSC_HAVE_CRAY_VECTOR) 384 #pragma _CRI preferstream 385 #endif 386 for (i=0; i<isize; i++) { 387 #if defined(PETSC_HAVE_CRAY_VECTOR) 388 #pragma _CRI prefervector 389 #endif 390 391 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 392 vec_y = _mm512_setzero_pd(); 393 ipos = ip[i]; 394 for (j=0; j<(nz>>3); j++) { 395 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 396 vec_vals = _mm512_loadu_pd(&aa[ipos]); 397 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 398 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 399 ipos += 8; 400 } 401 if ((nz&0x07)>2) { 402 mask = (__mmask8)(0xff >> (8-(nz&0x07))); 403 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 404 vec_vals = _mm512_loadu_pd(&aa[ipos]); 405 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 406 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 407 } else if ((nz&0x07)==2) { 408 yp[i] += aa[ipos]*x[aj[ipos]]; 409 yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 410 } else if ((nz&0x07)==1) { 411 yp[i] += aa[ipos]*x[aj[ipos]]; 412 } 413 yp[i] += _mm512_reduce_add_pd(vec_y); 414 #else 415 for (j=0; j<nz; j++) { 416 ipos = ip[i] + j; 417 yp[i] += aa[ipos] * x[aj[ipos]]; 418 } 419 #endif 420 } 421 } else { 422 /* Otherwise, there are enough rows in the chunk to make it 423 * worthwhile to vectorize across the rows, that is, to do the 424 * matvec by operating with "columns" of the chunk. */ 425 for (j=0; j<nz; j++) { 426 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 427 vec_j = _mm256_set1_epi32(j); 428 for (i=0; i<((isize>>3)<<3); i+=8) { 429 vec_y = _mm512_loadu_pd(&yp[i]); 430 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 431 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 432 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 433 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 434 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 435 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 436 _mm512_storeu_pd(&yp[i],vec_y); 437 } 438 for (i=isize-(isize&0x07); i<isize; i++) { 439 ipos = ip[i]+j; 440 yp[i] += aa[ipos]*x[aj[ipos]]; 441 } 442 #else 443 for (i=0; i<isize; i++) { 444 ipos = ip[i] + j; 445 yp[i] += aa[ipos] * x[aj[ipos]]; 446 } 447 #endif 448 } 449 } 450 451 #if defined(PETSC_HAVE_CRAY_VECTOR) 452 #pragma _CRI ivdep 453 #endif 454 /* Put results from yp[] into non-permuted result vector y. */ 455 for (i=0; i<isize; i++) { 456 y[iperm[istart+i]] = yp[i]; 457 } 458 } /* End processing chunk of isize rows of a group. */ 459 } /* End handling matvec for chunk with nz > 1. */ 460 } /* End loop over igroup. */ 461 #endif 462 PetscCall(PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0))); 463 PetscCall(VecRestoreArrayRead(xx,&x)); 464 PetscCall(VecRestoreArray(yy,&y)); 465 PetscFunctionReturn(0); 466 } 467 468 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 469 * Note that the names I used to designate the vectors differs from that 470 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 471 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 472 /* 473 I hate having virtually identical code for the mult and the multadd!!! 474 */ 475 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 476 { 477 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 478 const PetscScalar *x; 479 PetscScalar *y,*w; 480 const MatScalar *aa; 481 const PetscInt *aj,*ai; 482 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 483 PetscInt i,j; 484 #endif 485 486 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 487 Mat_SeqAIJPERM * aijperm; 488 PetscInt *iperm; /* Points to the permutation vector. */ 489 PetscInt *xgroup; 490 /* Denotes where groups of rows with same number of nonzeros 491 * begin and end in iperm. */ 492 PetscInt *nzgroup; 493 PetscInt ngroup; 494 PetscInt igroup; 495 PetscInt jstart,jend; 496 /* jstart is used in loops to denote the position in iperm where a 497 * group starts; jend denotes the position where it ends. 498 * (jend + 1 is where the next group starts.) */ 499 PetscInt iold,nz; 500 PetscInt istart,iend,isize; 501 PetscInt ipos; 502 PetscScalar yp[NDIM]; 503 PetscInt ip[NDIM]; 504 /* yp[] and ip[] are treated as vector "registers" for performing 505 * the mat-vec. */ 506 507 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 508 #pragma disjoint(*x,*y,*aa) 509 #endif 510 511 PetscFunctionBegin; 512 PetscCall(VecGetArrayRead(xx,&x)); 513 PetscCall(VecGetArrayPair(yy,ww,&y,&w)); 514 515 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 516 aa = a->a; /* Nonzero elements stored row-by-row. */ 517 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 518 519 /* Get the info we need about the permutations and groupings. */ 520 aijperm = (Mat_SeqAIJPERM*) A->spptr; 521 iperm = aijperm->iperm; 522 ngroup = aijperm->ngroup; 523 xgroup = aijperm->xgroup; 524 nzgroup = aijperm->nzgroup; 525 526 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 527 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 528 #else 529 530 for (igroup=0; igroup<ngroup; igroup++) { 531 jstart = xgroup[igroup]; 532 jend = xgroup[igroup+1] - 1; 533 534 nz = nzgroup[igroup]; 535 536 /* Handle the special cases where the number of nonzeros per row 537 * in the group is either 0 or 1. */ 538 if (nz == 0) { 539 for (i=jstart; i<=jend; i++) { 540 iold = iperm[i]; 541 y[iold] = w[iold]; 542 } 543 } 544 else if (nz == 1) { 545 for (i=jstart; i<=jend; i++) { 546 iold = iperm[i]; 547 ipos = ai[iold]; 548 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 549 } 550 } 551 /* For the general case: */ 552 else { 553 554 /* We work our way through the current group in chunks of NDIM rows 555 * at a time. */ 556 557 for (istart=jstart; istart<=jend; istart+=NDIM) { 558 /* Figure out where the chunk of 'isize' rows ends in iperm. 559 * 'isize may of course be less than NDIM for the last chunk. */ 560 iend = istart + (NDIM - 1); 561 if (iend > jend) iend = jend; 562 isize = iend - istart + 1; 563 564 /* Initialize the yp[] array that will be used to hold part of 565 * the permuted results vector, and figure out where in aa each 566 * row of the chunk will begin. */ 567 for (i=0; i<isize; i++) { 568 iold = iperm[istart + i]; 569 /* iold is a row number from the matrix A *before* reordering. */ 570 ip[i] = ai[iold]; 571 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 572 yp[i] = w[iold]; 573 } 574 575 /* If the number of zeros per row exceeds the number of rows in 576 * the chunk, we should vectorize along nz, that is, perform the 577 * mat-vec one row at a time as in the usual CSR case. */ 578 if (nz > isize) { 579 #if defined(PETSC_HAVE_CRAY_VECTOR) 580 #pragma _CRI preferstream 581 #endif 582 for (i=0; i<isize; i++) { 583 #if defined(PETSC_HAVE_CRAY_VECTOR) 584 #pragma _CRI prefervector 585 #endif 586 for (j=0; j<nz; j++) { 587 ipos = ip[i] + j; 588 yp[i] += aa[ipos] * x[aj[ipos]]; 589 } 590 } 591 } 592 /* Otherwise, there are enough rows in the chunk to make it 593 * worthwhile to vectorize across the rows, that is, to do the 594 * matvec by operating with "columns" of the chunk. */ 595 else { 596 for (j=0; j<nz; j++) { 597 for (i=0; i<isize; i++) { 598 ipos = ip[i] + j; 599 yp[i] += aa[ipos] * x[aj[ipos]]; 600 } 601 } 602 } 603 604 #if defined(PETSC_HAVE_CRAY_VECTOR) 605 #pragma _CRI ivdep 606 #endif 607 /* Put results from yp[] into non-permuted result vector y. */ 608 for (i=0; i<isize; i++) { 609 y[iperm[istart+i]] = yp[i]; 610 } 611 } /* End processing chunk of isize rows of a group. */ 612 613 } /* End handling matvec for chunk with nz > 1. */ 614 } /* End loop over igroup. */ 615 616 #endif 617 PetscCall(PetscLogFlops(2.0*a->nz)); 618 PetscCall(VecRestoreArrayRead(xx,&x)); 619 PetscCall(VecRestoreArrayPair(yy,ww,&y,&w)); 620 PetscFunctionReturn(0); 621 } 622 623 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 624 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 625 * routine, but can also be used to convert an assembled SeqAIJ matrix 626 * into a SeqAIJPERM one. */ 627 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 628 { 629 Mat B = *newmat; 630 Mat_SeqAIJPERM *aijperm; 631 PetscBool sametype; 632 633 PetscFunctionBegin; 634 if (reuse == MAT_INITIAL_MATRIX) { 635 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B)); 636 } 637 PetscCall(PetscObjectTypeCompare((PetscObject)A,type,&sametype)); 638 if (sametype) PetscFunctionReturn(0); 639 640 PetscCall(PetscNewLog(B,&aijperm)); 641 B->spptr = (void*) aijperm; 642 643 /* Set function pointers for methods that we inherit from AIJ but override. */ 644 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 645 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 646 B->ops->destroy = MatDestroy_SeqAIJPERM; 647 B->ops->mult = MatMult_SeqAIJPERM; 648 B->ops->multadd = MatMultAdd_SeqAIJPERM; 649 650 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 651 /* If A has already been assembled, compute the permutation. */ 652 if (A->assembled) { 653 PetscCall(MatSeqAIJPERM_create_perm(B)); 654 } 655 656 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ)); 657 658 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM)); 659 *newmat = B; 660 PetscFunctionReturn(0); 661 } 662 663 /*@C 664 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 665 This type inherits from AIJ, but calculates some additional permutation 666 information that is used to allow better vectorization of some 667 operations. At the cost of increased storage, the AIJ formatted 668 matrix can be copied to a format in which pieces of the matrix are 669 stored in ELLPACK format, allowing the vectorized matrix multiply 670 routine to use stride-1 memory accesses. As with the AIJ type, it is 671 important to preallocate matrix storage in order to get good assembly 672 performance. 673 674 Collective 675 676 Input Parameters: 677 + comm - MPI communicator, set to PETSC_COMM_SELF 678 . m - number of rows 679 . n - number of columns 680 . nz - number of nonzeros per row (same for all rows) 681 - nnz - array containing the number of nonzeros in the various rows 682 (possibly different for each row) or NULL 683 684 Output Parameter: 685 . A - the matrix 686 687 Notes: 688 If nnz is given then nz is ignored 689 690 Level: intermediate 691 692 .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()` 693 @*/ 694 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 695 { 696 PetscFunctionBegin; 697 PetscCall(MatCreate(comm,A)); 698 PetscCall(MatSetSizes(*A,m,n,m,n)); 699 PetscCall(MatSetType(*A,MATSEQAIJPERM)); 700 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz)); 701 PetscFunctionReturn(0); 702 } 703 704 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 705 { 706 PetscFunctionBegin; 707 PetscCall(MatSetType(A,MATSEQAIJ)); 708 PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A)); 709 PetscFunctionReturn(0); 710 } 711