1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 PetscErrorCode ierr; 65 Mat B = *newmat; 66 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 67 68 PetscFunctionBegin; 69 if (reuse == MAT_INITIAL_MATRIX) { 70 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 71 aijperm=(Mat_SeqAIJPERM*)B->spptr; 72 } 73 74 /* Reset the original function pointers. */ 75 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 76 B->ops->destroy = MatDestroy_SeqAIJ; 77 B->ops->duplicate = MatDuplicate_SeqAIJ; 78 B->ops->mult = MatMult_SeqAIJ; 79 B->ops->multadd = MatMultAdd_SeqAIJ; 80 81 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 82 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 83 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 84 85 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 86 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 87 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 88 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 89 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 90 91 /* Change the type of B to MATSEQAIJ. */ 92 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 93 94 *newmat = B; 95 PetscFunctionReturn(0); 96 } 97 98 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 99 { 100 PetscErrorCode ierr; 101 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 102 103 PetscFunctionBegin; 104 if (aijperm) { 105 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 106 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 107 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 108 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 109 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 110 } 111 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 112 * to destroy everything that remains. */ 113 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 114 /* Note that I don't call MatSetType(). I believe this is because that 115 * is only to be called when *building* a matrix. I could be wrong, but 116 * that is how things work for the SuperLU matrix class. */ 117 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 122 { 123 PetscErrorCode ierr; 124 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 125 Mat_SeqAIJPERM *aijperm_dest; 126 PetscBool perm; 127 128 PetscFunctionBegin; 129 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 130 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 131 if (perm) { 132 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 133 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 134 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 135 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 136 } else { 137 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 138 (*M)->spptr = (void*) aijperm_dest; 139 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 140 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 141 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 142 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 143 } 144 ierr = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr); 145 /* Allocate space for, and copy the grouping and permutation info. 146 * I note that when the groups are initially determined in 147 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 148 * necessary. But at this point, we know how large they need to be, and 149 * allocate only the necessary amount of memory. So the duplicated matrix 150 * may actually use slightly less storage than the original! */ 151 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 152 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 153 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 154 ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr); 155 ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr); 156 ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr); 157 PetscFunctionReturn(0); 158 } 159 160 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 161 { 162 PetscErrorCode ierr; 163 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 164 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 165 PetscInt m; /* Number of rows in the matrix. */ 166 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 167 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 168 PetscInt *rows_in_bucket; 169 /* To construct the permutation, we sort each row into one of maxnz 170 * buckets based on how many nonzeros are in the row. */ 171 PetscInt nz; 172 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 173 PetscInt *ipnz; 174 /* When constructing the iperm permutation vector, 175 * ipnz[nz] is used to point to the next place in the permutation vector 176 * that a row with nz nonzero elements should be placed.*/ 177 PetscInt i, ngroup, istart, ipos; 178 179 PetscFunctionBegin; 180 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 181 aijperm->nonzerostate = A->nonzerostate; 182 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 183 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 184 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 185 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 186 187 m = A->rmap->n; 188 ia = a->i; 189 190 /* Allocate the arrays that will hold the permutation vector. */ 191 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 192 193 /* Allocate some temporary work arrays that will be used in 194 * calculating the permuation vector and groupings. */ 195 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 196 197 /* Now actually figure out the permutation and grouping. */ 198 199 /* First pass: Determine number of nonzeros in each row, maximum 200 * number of nonzeros in any row, and how many rows fall into each 201 * "bucket" of rows with same number of nonzeros. */ 202 maxnz = 0; 203 for (i=0; i<m; i++) { 204 nz_in_row[i] = ia[i+1]-ia[i]; 205 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 206 } 207 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 208 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 209 210 for (i=0; i<=maxnz; i++) { 211 rows_in_bucket[i] = 0; 212 } 213 for (i=0; i<m; i++) { 214 nz = nz_in_row[i]; 215 rows_in_bucket[nz]++; 216 } 217 218 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 219 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 220 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 221 * then xgroup[] must consist of (maxnz + 2) elements, since the last 222 * element of xgroup will tell us where the (maxnz + 1)th group ends. 223 * We allocate space for the maximum number of groups; 224 * that is potentially a little wasteful, but not too much so. 225 * Perhaps I should fix it later. */ 226 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 227 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 228 229 /* Second pass. Look at what is in the buckets and create the groupings. 230 * Note that it is OK to have a group of rows with no non-zero values. */ 231 ngroup = 0; 232 istart = 0; 233 for (i=0; i<=maxnz; i++) { 234 if (rows_in_bucket[i] > 0) { 235 aijperm->nzgroup[ngroup] = i; 236 aijperm->xgroup[ngroup] = istart; 237 ngroup++; 238 istart += rows_in_bucket[i]; 239 } 240 } 241 242 aijperm->xgroup[ngroup] = istart; 243 aijperm->ngroup = ngroup; 244 245 /* Now fill in the permutation vector iperm. */ 246 ipnz[0] = 0; 247 for (i=0; i<maxnz; i++) { 248 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 249 } 250 251 for (i=0; i<m; i++) { 252 nz = nz_in_row[i]; 253 ipos = ipnz[nz]; 254 aijperm->iperm[ipos] = i; 255 ipnz[nz]++; 256 } 257 258 /* Clean up temporary work arrays. */ 259 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 260 ierr = PetscFree(ipnz);CHKERRQ(ierr); 261 ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 266 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 270 271 PetscFunctionBegin; 272 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 273 274 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 275 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 276 * I'm not sure if this is the best way to do this, but it avoids 277 * a lot of code duplication. 278 * I also note that currently MATSEQAIJPERM doesn't know anything about 279 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 280 * are many zero rows. If the SeqAIJ assembly end routine decides to use 281 * this, this may break things. (Don't know... haven't looked at it.) */ 282 a->inode.use = PETSC_FALSE; 283 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 284 285 /* Now calculate the permutation and grouping information. */ 286 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 287 PetscFunctionReturn(0); 288 } 289 290 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 291 { 292 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 293 const PetscScalar *x; 294 PetscScalar *y; 295 const MatScalar *aa; 296 PetscErrorCode ierr; 297 const PetscInt *aj,*ai; 298 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 299 PetscInt i,j; 300 #endif 301 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 302 __m512d vec_x,vec_y,vec_vals; 303 __m256i vec_idx,vec_ipos,vec_j; 304 __mmask8 mask; 305 #endif 306 307 /* Variables that don't appear in MatMult_SeqAIJ. */ 308 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 309 PetscInt *iperm; /* Points to the permutation vector. */ 310 PetscInt *xgroup; 311 /* Denotes where groups of rows with same number of nonzeros 312 * begin and end in iperm. */ 313 PetscInt *nzgroup; 314 PetscInt ngroup; 315 PetscInt igroup; 316 PetscInt jstart,jend; 317 /* jstart is used in loops to denote the position in iperm where a 318 * group starts; jend denotes the position where it ends. 319 * (jend + 1 is where the next group starts.) */ 320 PetscInt iold,nz; 321 PetscInt istart,iend,isize; 322 PetscInt ipos; 323 PetscScalar yp[NDIM]; 324 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 325 326 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 327 #pragma disjoint(*x,*y,*aa) 328 #endif 329 330 PetscFunctionBegin; 331 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 332 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 333 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 334 aa = a->a; /* Nonzero elements stored row-by-row. */ 335 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 336 337 /* Get the info we need about the permutations and groupings. */ 338 iperm = aijperm->iperm; 339 ngroup = aijperm->ngroup; 340 xgroup = aijperm->xgroup; 341 nzgroup = aijperm->nzgroup; 342 343 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 344 fortranmultaijperm_(&m,x,ii,aj,aa,y); 345 #else 346 347 for (igroup=0; igroup<ngroup; igroup++) { 348 jstart = xgroup[igroup]; 349 jend = xgroup[igroup+1] - 1; 350 nz = nzgroup[igroup]; 351 352 /* Handle the special cases where the number of nonzeros per row 353 * in the group is either 0 or 1. */ 354 if (nz == 0) { 355 for (i=jstart; i<=jend; i++) { 356 y[iperm[i]] = 0.0; 357 } 358 } else if (nz == 1) { 359 for (i=jstart; i<=jend; i++) { 360 iold = iperm[i]; 361 ipos = ai[iold]; 362 y[iold] = aa[ipos] * x[aj[ipos]]; 363 } 364 } else { 365 366 /* We work our way through the current group in chunks of NDIM rows 367 * at a time. */ 368 369 for (istart=jstart; istart<=jend; istart+=NDIM) { 370 /* Figure out where the chunk of 'isize' rows ends in iperm. 371 * 'isize may of course be less than NDIM for the last chunk. */ 372 iend = istart + (NDIM - 1); 373 374 if (iend > jend) iend = jend; 375 376 isize = iend - istart + 1; 377 378 /* Initialize the yp[] array that will be used to hold part of 379 * the permuted results vector, and figure out where in aa each 380 * row of the chunk will begin. */ 381 for (i=0; i<isize; i++) { 382 iold = iperm[istart + i]; 383 /* iold is a row number from the matrix A *before* reordering. */ 384 ip[i] = ai[iold]; 385 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 386 yp[i] = (PetscScalar) 0.0; 387 } 388 389 /* If the number of zeros per row exceeds the number of rows in 390 * the chunk, we should vectorize along nz, that is, perform the 391 * mat-vec one row at a time as in the usual CSR case. */ 392 if (nz > isize) { 393 #if defined(PETSC_HAVE_CRAY_VECTOR) 394 #pragma _CRI preferstream 395 #endif 396 for (i=0; i<isize; i++) { 397 #if defined(PETSC_HAVE_CRAY_VECTOR) 398 #pragma _CRI prefervector 399 #endif 400 401 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 402 vec_y = _mm512_setzero_pd(); 403 ipos = ip[i]; 404 for (j=0; j<(nz>>3); j++) { 405 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 406 vec_vals = _mm512_loadu_pd(&aa[ipos]); 407 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 408 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 409 ipos += 8; 410 } 411 if ((nz&0x07)>2) { 412 mask = (__mmask8)(0xff >> (8-(nz&0x07))); 413 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 414 vec_vals = _mm512_loadu_pd(&aa[ipos]); 415 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 416 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 417 } else if ((nz&0x07)==2) { 418 yp[i] += aa[ipos]*x[aj[ipos]]; 419 yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 420 } else if ((nz&0x07)==1) { 421 yp[i] += aa[ipos]*x[aj[ipos]]; 422 } 423 yp[i] += _mm512_reduce_add_pd(vec_y); 424 #else 425 for (j=0; j<nz; j++) { 426 ipos = ip[i] + j; 427 yp[i] += aa[ipos] * x[aj[ipos]]; 428 } 429 #endif 430 } 431 } else { 432 /* Otherwise, there are enough rows in the chunk to make it 433 * worthwhile to vectorize across the rows, that is, to do the 434 * matvec by operating with "columns" of the chunk. */ 435 for (j=0; j<nz; j++) { 436 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 437 vec_j = _mm256_set1_epi32(j); 438 for (i=0; i<((isize>>3)<<3); i+=8) { 439 vec_y = _mm512_loadu_pd(&yp[i]); 440 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 441 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 442 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 443 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 444 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 445 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 446 _mm512_storeu_pd(&yp[i],vec_y); 447 } 448 for (i=isize-(isize&0x07); i<isize; i++) { 449 ipos = ip[i]+j; 450 yp[i] += aa[ipos]*x[aj[ipos]]; 451 } 452 #else 453 for (i=0; i<isize; i++) { 454 ipos = ip[i] + j; 455 yp[i] += aa[ipos] * x[aj[ipos]]; 456 } 457 #endif 458 } 459 } 460 461 #if defined(PETSC_HAVE_CRAY_VECTOR) 462 #pragma _CRI ivdep 463 #endif 464 /* Put results from yp[] into non-permuted result vector y. */ 465 for (i=0; i<isize; i++) { 466 y[iperm[istart+i]] = yp[i]; 467 } 468 } /* End processing chunk of isize rows of a group. */ 469 } /* End handling matvec for chunk with nz > 1. */ 470 } /* End loop over igroup. */ 471 #endif 472 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 473 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 474 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 475 PetscFunctionReturn(0); 476 } 477 478 479 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 480 * Note that the names I used to designate the vectors differs from that 481 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 482 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 483 /* 484 I hate having virtually identical code for the mult and the multadd!!! 485 */ 486 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 487 { 488 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 489 const PetscScalar *x; 490 PetscScalar *y,*w; 491 const MatScalar *aa; 492 PetscErrorCode ierr; 493 const PetscInt *aj,*ai; 494 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 495 PetscInt i,j; 496 #endif 497 498 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 499 Mat_SeqAIJPERM * aijperm; 500 PetscInt *iperm; /* Points to the permutation vector. */ 501 PetscInt *xgroup; 502 /* Denotes where groups of rows with same number of nonzeros 503 * begin and end in iperm. */ 504 PetscInt *nzgroup; 505 PetscInt ngroup; 506 PetscInt igroup; 507 PetscInt jstart,jend; 508 /* jstart is used in loops to denote the position in iperm where a 509 * group starts; jend denotes the position where it ends. 510 * (jend + 1 is where the next group starts.) */ 511 PetscInt iold,nz; 512 PetscInt istart,iend,isize; 513 PetscInt ipos; 514 PetscScalar yp[NDIM]; 515 PetscInt ip[NDIM]; 516 /* yp[] and ip[] are treated as vector "registers" for performing 517 * the mat-vec. */ 518 519 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 520 #pragma disjoint(*x,*y,*aa) 521 #endif 522 523 PetscFunctionBegin; 524 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 525 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 526 527 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 528 aa = a->a; /* Nonzero elements stored row-by-row. */ 529 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 530 531 /* Get the info we need about the permutations and groupings. */ 532 aijperm = (Mat_SeqAIJPERM*) A->spptr; 533 iperm = aijperm->iperm; 534 ngroup = aijperm->ngroup; 535 xgroup = aijperm->xgroup; 536 nzgroup = aijperm->nzgroup; 537 538 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 539 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 540 #else 541 542 for (igroup=0; igroup<ngroup; igroup++) { 543 jstart = xgroup[igroup]; 544 jend = xgroup[igroup+1] - 1; 545 546 nz = nzgroup[igroup]; 547 548 /* Handle the special cases where the number of nonzeros per row 549 * in the group is either 0 or 1. */ 550 if (nz == 0) { 551 for (i=jstart; i<=jend; i++) { 552 iold = iperm[i]; 553 y[iold] = w[iold]; 554 } 555 } 556 else if (nz == 1) { 557 for (i=jstart; i<=jend; i++) { 558 iold = iperm[i]; 559 ipos = ai[iold]; 560 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 561 } 562 } 563 /* For the general case: */ 564 else { 565 566 /* We work our way through the current group in chunks of NDIM rows 567 * at a time. */ 568 569 for (istart=jstart; istart<=jend; istart+=NDIM) { 570 /* Figure out where the chunk of 'isize' rows ends in iperm. 571 * 'isize may of course be less than NDIM for the last chunk. */ 572 iend = istart + (NDIM - 1); 573 if (iend > jend) iend = jend; 574 isize = iend - istart + 1; 575 576 /* Initialize the yp[] array that will be used to hold part of 577 * the permuted results vector, and figure out where in aa each 578 * row of the chunk will begin. */ 579 for (i=0; i<isize; i++) { 580 iold = iperm[istart + i]; 581 /* iold is a row number from the matrix A *before* reordering. */ 582 ip[i] = ai[iold]; 583 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 584 yp[i] = w[iold]; 585 } 586 587 /* If the number of zeros per row exceeds the number of rows in 588 * the chunk, we should vectorize along nz, that is, perform the 589 * mat-vec one row at a time as in the usual CSR case. */ 590 if (nz > isize) { 591 #if defined(PETSC_HAVE_CRAY_VECTOR) 592 #pragma _CRI preferstream 593 #endif 594 for (i=0; i<isize; i++) { 595 #if defined(PETSC_HAVE_CRAY_VECTOR) 596 #pragma _CRI prefervector 597 #endif 598 for (j=0; j<nz; j++) { 599 ipos = ip[i] + j; 600 yp[i] += aa[ipos] * x[aj[ipos]]; 601 } 602 } 603 } 604 /* Otherwise, there are enough rows in the chunk to make it 605 * worthwhile to vectorize across the rows, that is, to do the 606 * matvec by operating with "columns" of the chunk. */ 607 else { 608 for (j=0; j<nz; j++) { 609 for (i=0; i<isize; i++) { 610 ipos = ip[i] + j; 611 yp[i] += aa[ipos] * x[aj[ipos]]; 612 } 613 } 614 } 615 616 #if defined(PETSC_HAVE_CRAY_VECTOR) 617 #pragma _CRI ivdep 618 #endif 619 /* Put results from yp[] into non-permuted result vector y. */ 620 for (i=0; i<isize; i++) { 621 y[iperm[istart+i]] = yp[i]; 622 } 623 } /* End processing chunk of isize rows of a group. */ 624 625 } /* End handling matvec for chunk with nz > 1. */ 626 } /* End loop over igroup. */ 627 628 #endif 629 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 630 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 631 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 632 PetscFunctionReturn(0); 633 } 634 635 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 636 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 637 * routine, but can also be used to convert an assembled SeqAIJ matrix 638 * into a SeqAIJPERM one. */ 639 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 640 { 641 PetscErrorCode ierr; 642 Mat B = *newmat; 643 Mat_SeqAIJPERM *aijperm; 644 PetscBool sametype; 645 646 PetscFunctionBegin; 647 if (reuse == MAT_INITIAL_MATRIX) { 648 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 649 } 650 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 651 if (sametype) PetscFunctionReturn(0); 652 653 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 654 B->spptr = (void*) aijperm; 655 656 /* Set function pointers for methods that we inherit from AIJ but override. */ 657 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 658 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 659 B->ops->destroy = MatDestroy_SeqAIJPERM; 660 B->ops->mult = MatMult_SeqAIJPERM; 661 B->ops->multadd = MatMultAdd_SeqAIJPERM; 662 663 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 664 /* If A has already been assembled, compute the permutation. */ 665 if (A->assembled) { 666 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 667 } 668 669 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 670 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 671 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 672 673 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 674 *newmat = B; 675 PetscFunctionReturn(0); 676 } 677 678 /*@C 679 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 680 This type inherits from AIJ, but calculates some additional permutation 681 information that is used to allow better vectorization of some 682 operations. At the cost of increased storage, the AIJ formatted 683 matrix can be copied to a format in which pieces of the matrix are 684 stored in ELLPACK format, allowing the vectorized matrix multiply 685 routine to use stride-1 memory accesses. As with the AIJ type, it is 686 important to preallocate matrix storage in order to get good assembly 687 performance. 688 689 Collective 690 691 Input Parameters: 692 + comm - MPI communicator, set to PETSC_COMM_SELF 693 . m - number of rows 694 . n - number of columns 695 . nz - number of nonzeros per row (same for all rows) 696 - nnz - array containing the number of nonzeros in the various rows 697 (possibly different for each row) or NULL 698 699 Output Parameter: 700 . A - the matrix 701 702 Notes: 703 If nnz is given then nz is ignored 704 705 Level: intermediate 706 707 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 708 @*/ 709 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 710 { 711 PetscErrorCode ierr; 712 713 PetscFunctionBegin; 714 ierr = MatCreate(comm,A);CHKERRQ(ierr); 715 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 716 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 717 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 718 PetscFunctionReturn(0); 719 } 720 721 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 722 { 723 PetscErrorCode ierr; 724 725 PetscFunctionBegin; 726 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 727 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 728 PetscFunctionReturn(0); 729 } 730 731