1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 PetscErrorCode ierr; 65 Mat B = *newmat; 66 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 67 68 PetscFunctionBegin; 69 if (reuse == MAT_INITIAL_MATRIX) { 70 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 71 aijperm=(Mat_SeqAIJPERM*)B->spptr; 72 } 73 74 /* Reset the original function pointers. */ 75 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 76 B->ops->destroy = MatDestroy_SeqAIJ; 77 B->ops->duplicate = MatDuplicate_SeqAIJ; 78 B->ops->mult = MatMult_SeqAIJ; 79 B->ops->multadd = MatMultAdd_SeqAIJ; 80 81 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 82 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 83 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 84 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 85 86 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 87 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 88 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 89 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 90 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 91 92 /* Change the type of B to MATSEQAIJ. */ 93 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 94 95 *newmat = B; 96 PetscFunctionReturn(0); 97 } 98 99 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 100 { 101 PetscErrorCode ierr; 102 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 103 104 PetscFunctionBegin; 105 if (aijperm) { 106 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 107 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 108 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 109 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 110 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 111 } 112 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 113 * to destroy everything that remains. */ 114 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 115 /* Note that I don't call MatSetType(). I believe this is because that 116 * is only to be called when *building* a matrix. I could be wrong, but 117 * that is how things work for the SuperLU matrix class. */ 118 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 123 { 124 PetscErrorCode ierr; 125 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 126 Mat_SeqAIJPERM *aijperm_dest; 127 PetscBool perm; 128 129 PetscFunctionBegin; 130 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 131 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 132 if (perm) { 133 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 134 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 135 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 136 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 137 } else { 138 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 139 (*M)->spptr = (void*) aijperm_dest; 140 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 141 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 142 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 143 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 144 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 145 } 146 ierr = PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));CHKERRQ(ierr); 147 /* Allocate space for, and copy the grouping and permutation info. 148 * I note that when the groups are initially determined in 149 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 150 * necessary. But at this point, we know how large they need to be, and 151 * allocate only the necessary amount of memory. So the duplicated matrix 152 * may actually use slightly less storage than the original! */ 153 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 154 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 155 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 156 ierr = PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);CHKERRQ(ierr); 157 ierr = PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));CHKERRQ(ierr); 158 ierr = PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);CHKERRQ(ierr); 159 PetscFunctionReturn(0); 160 } 161 162 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 163 { 164 PetscErrorCode ierr; 165 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 166 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 167 PetscInt m; /* Number of rows in the matrix. */ 168 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 169 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 170 PetscInt *rows_in_bucket; 171 /* To construct the permutation, we sort each row into one of maxnz 172 * buckets based on how many nonzeros are in the row. */ 173 PetscInt nz; 174 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 175 PetscInt *ipnz; 176 /* When constructing the iperm permutation vector, 177 * ipnz[nz] is used to point to the next place in the permutation vector 178 * that a row with nz nonzero elements should be placed.*/ 179 PetscInt i, ngroup, istart, ipos; 180 181 PetscFunctionBegin; 182 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 183 aijperm->nonzerostate = A->nonzerostate; 184 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 185 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 186 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 187 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 188 189 m = A->rmap->n; 190 ia = a->i; 191 192 /* Allocate the arrays that will hold the permutation vector. */ 193 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 194 195 /* Allocate some temporary work arrays that will be used in 196 * calculating the permuation vector and groupings. */ 197 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 198 199 /* Now actually figure out the permutation and grouping. */ 200 201 /* First pass: Determine number of nonzeros in each row, maximum 202 * number of nonzeros in any row, and how many rows fall into each 203 * "bucket" of rows with same number of nonzeros. */ 204 maxnz = 0; 205 for (i=0; i<m; i++) { 206 nz_in_row[i] = ia[i+1]-ia[i]; 207 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 208 } 209 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 210 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 211 212 for (i=0; i<=maxnz; i++) { 213 rows_in_bucket[i] = 0; 214 } 215 for (i=0; i<m; i++) { 216 nz = nz_in_row[i]; 217 rows_in_bucket[nz]++; 218 } 219 220 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 221 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 222 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 223 * then xgroup[] must consist of (maxnz + 2) elements, since the last 224 * element of xgroup will tell us where the (maxnz + 1)th group ends. 225 * We allocate space for the maximum number of groups; 226 * that is potentially a little wasteful, but not too much so. 227 * Perhaps I should fix it later. */ 228 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 229 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 230 231 /* Second pass. Look at what is in the buckets and create the groupings. 232 * Note that it is OK to have a group of rows with no non-zero values. */ 233 ngroup = 0; 234 istart = 0; 235 for (i=0; i<=maxnz; i++) { 236 if (rows_in_bucket[i] > 0) { 237 aijperm->nzgroup[ngroup] = i; 238 aijperm->xgroup[ngroup] = istart; 239 ngroup++; 240 istart += rows_in_bucket[i]; 241 } 242 } 243 244 aijperm->xgroup[ngroup] = istart; 245 aijperm->ngroup = ngroup; 246 247 /* Now fill in the permutation vector iperm. */ 248 ipnz[0] = 0; 249 for (i=0; i<maxnz; i++) { 250 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 251 } 252 253 for (i=0; i<m; i++) { 254 nz = nz_in_row[i]; 255 ipos = ipnz[nz]; 256 aijperm->iperm[ipos] = i; 257 ipnz[nz]++; 258 } 259 260 /* Clean up temporary work arrays. */ 261 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 262 ierr = PetscFree(ipnz);CHKERRQ(ierr); 263 ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 264 PetscFunctionReturn(0); 265 } 266 267 268 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 269 { 270 PetscErrorCode ierr; 271 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 272 273 PetscFunctionBegin; 274 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 275 276 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 277 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 278 * I'm not sure if this is the best way to do this, but it avoids 279 * a lot of code duplication. 280 * I also note that currently MATSEQAIJPERM doesn't know anything about 281 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 282 * are many zero rows. If the SeqAIJ assembly end routine decides to use 283 * this, this may break things. (Don't know... haven't looked at it.) */ 284 a->inode.use = PETSC_FALSE; 285 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 286 287 /* Now calculate the permutation and grouping information. */ 288 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 289 PetscFunctionReturn(0); 290 } 291 292 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 293 { 294 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 295 const PetscScalar *x; 296 PetscScalar *y; 297 const MatScalar *aa; 298 PetscErrorCode ierr; 299 const PetscInt *aj,*ai; 300 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 301 PetscInt i,j; 302 #endif 303 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 304 __m512d vec_x,vec_y,vec_vals; 305 __m256i vec_idx,vec_ipos,vec_j; 306 __mmask8 mask; 307 #endif 308 309 /* Variables that don't appear in MatMult_SeqAIJ. */ 310 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 311 PetscInt *iperm; /* Points to the permutation vector. */ 312 PetscInt *xgroup; 313 /* Denotes where groups of rows with same number of nonzeros 314 * begin and end in iperm. */ 315 PetscInt *nzgroup; 316 PetscInt ngroup; 317 PetscInt igroup; 318 PetscInt jstart,jend; 319 /* jstart is used in loops to denote the position in iperm where a 320 * group starts; jend denotes the position where it ends. 321 * (jend + 1 is where the next group starts.) */ 322 PetscInt iold,nz; 323 PetscInt istart,iend,isize; 324 PetscInt ipos; 325 PetscScalar yp[NDIM]; 326 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 327 328 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 329 #pragma disjoint(*x,*y,*aa) 330 #endif 331 332 PetscFunctionBegin; 333 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 334 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 335 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 336 aa = a->a; /* Nonzero elements stored row-by-row. */ 337 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 338 339 /* Get the info we need about the permutations and groupings. */ 340 iperm = aijperm->iperm; 341 ngroup = aijperm->ngroup; 342 xgroup = aijperm->xgroup; 343 nzgroup = aijperm->nzgroup; 344 345 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 346 fortranmultaijperm_(&m,x,ii,aj,aa,y); 347 #else 348 349 for (igroup=0; igroup<ngroup; igroup++) { 350 jstart = xgroup[igroup]; 351 jend = xgroup[igroup+1] - 1; 352 nz = nzgroup[igroup]; 353 354 /* Handle the special cases where the number of nonzeros per row 355 * in the group is either 0 or 1. */ 356 if (nz == 0) { 357 for (i=jstart; i<=jend; i++) { 358 y[iperm[i]] = 0.0; 359 } 360 } else if (nz == 1) { 361 for (i=jstart; i<=jend; i++) { 362 iold = iperm[i]; 363 ipos = ai[iold]; 364 y[iold] = aa[ipos] * x[aj[ipos]]; 365 } 366 } else { 367 368 /* We work our way through the current group in chunks of NDIM rows 369 * at a time. */ 370 371 for (istart=jstart; istart<=jend; istart+=NDIM) { 372 /* Figure out where the chunk of 'isize' rows ends in iperm. 373 * 'isize may of course be less than NDIM for the last chunk. */ 374 iend = istart + (NDIM - 1); 375 376 if (iend > jend) iend = jend; 377 378 isize = iend - istart + 1; 379 380 /* Initialize the yp[] array that will be used to hold part of 381 * the permuted results vector, and figure out where in aa each 382 * row of the chunk will begin. */ 383 for (i=0; i<isize; i++) { 384 iold = iperm[istart + i]; 385 /* iold is a row number from the matrix A *before* reordering. */ 386 ip[i] = ai[iold]; 387 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 388 yp[i] = (PetscScalar) 0.0; 389 } 390 391 /* If the number of zeros per row exceeds the number of rows in 392 * the chunk, we should vectorize along nz, that is, perform the 393 * mat-vec one row at a time as in the usual CSR case. */ 394 if (nz > isize) { 395 #if defined(PETSC_HAVE_CRAY_VECTOR) 396 #pragma _CRI preferstream 397 #endif 398 for (i=0; i<isize; i++) { 399 #if defined(PETSC_HAVE_CRAY_VECTOR) 400 #pragma _CRI prefervector 401 #endif 402 403 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 404 vec_y = _mm512_setzero_pd(); 405 ipos = ip[i]; 406 for (j=0; j<(nz>>3); j++) { 407 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 408 vec_vals = _mm512_loadu_pd(&aa[ipos]); 409 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 410 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 411 ipos += 8; 412 } 413 if ((nz&0x07)>2) { 414 mask = (__mmask8)(0xff >> (8-(nz&0x07))); 415 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 416 vec_vals = _mm512_loadu_pd(&aa[ipos]); 417 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 418 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 419 } else if ((nz&0x07)==2) { 420 yp[i] += aa[ipos]*x[aj[ipos]]; 421 yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 422 } else if ((nz&0x07)==1) { 423 yp[i] += aa[ipos]*x[aj[ipos]]; 424 } 425 yp[i] += _mm512_reduce_add_pd(vec_y); 426 #else 427 for (j=0; j<nz; j++) { 428 ipos = ip[i] + j; 429 yp[i] += aa[ipos] * x[aj[ipos]]; 430 } 431 #endif 432 } 433 } else { 434 /* Otherwise, there are enough rows in the chunk to make it 435 * worthwhile to vectorize across the rows, that is, to do the 436 * matvec by operating with "columns" of the chunk. */ 437 for (j=0; j<nz; j++) { 438 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 439 vec_j = _mm256_set1_epi32(j); 440 for (i=0; i<((isize>>3)<<3); i+=8) { 441 vec_y = _mm512_loadu_pd(&yp[i]); 442 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 443 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 444 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 445 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 446 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 447 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 448 _mm512_storeu_pd(&yp[i],vec_y); 449 } 450 for (i=isize-(isize&0x07); i<isize; i++) { 451 ipos = ip[i]+j; 452 yp[i] += aa[ipos]*x[aj[ipos]]; 453 } 454 #else 455 for (i=0; i<isize; i++) { 456 ipos = ip[i] + j; 457 yp[i] += aa[ipos] * x[aj[ipos]]; 458 } 459 #endif 460 } 461 } 462 463 #if defined(PETSC_HAVE_CRAY_VECTOR) 464 #pragma _CRI ivdep 465 #endif 466 /* Put results from yp[] into non-permuted result vector y. */ 467 for (i=0; i<isize; i++) { 468 y[iperm[istart+i]] = yp[i]; 469 } 470 } /* End processing chunk of isize rows of a group. */ 471 } /* End handling matvec for chunk with nz > 1. */ 472 } /* End loop over igroup. */ 473 #endif 474 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 475 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 476 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 477 PetscFunctionReturn(0); 478 } 479 480 481 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 482 * Note that the names I used to designate the vectors differs from that 483 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 484 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 485 /* 486 I hate having virtually identical code for the mult and the multadd!!! 487 */ 488 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 489 { 490 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 491 const PetscScalar *x; 492 PetscScalar *y,*w; 493 const MatScalar *aa; 494 PetscErrorCode ierr; 495 const PetscInt *aj,*ai; 496 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 497 PetscInt i,j; 498 #endif 499 500 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 501 Mat_SeqAIJPERM * aijperm; 502 PetscInt *iperm; /* Points to the permutation vector. */ 503 PetscInt *xgroup; 504 /* Denotes where groups of rows with same number of nonzeros 505 * begin and end in iperm. */ 506 PetscInt *nzgroup; 507 PetscInt ngroup; 508 PetscInt igroup; 509 PetscInt jstart,jend; 510 /* jstart is used in loops to denote the position in iperm where a 511 * group starts; jend denotes the position where it ends. 512 * (jend + 1 is where the next group starts.) */ 513 PetscInt iold,nz; 514 PetscInt istart,iend,isize; 515 PetscInt ipos; 516 PetscScalar yp[NDIM]; 517 PetscInt ip[NDIM]; 518 /* yp[] and ip[] are treated as vector "registers" for performing 519 * the mat-vec. */ 520 521 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 522 #pragma disjoint(*x,*y,*aa) 523 #endif 524 525 PetscFunctionBegin; 526 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 527 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 528 529 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 530 aa = a->a; /* Nonzero elements stored row-by-row. */ 531 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 532 533 /* Get the info we need about the permutations and groupings. */ 534 aijperm = (Mat_SeqAIJPERM*) A->spptr; 535 iperm = aijperm->iperm; 536 ngroup = aijperm->ngroup; 537 xgroup = aijperm->xgroup; 538 nzgroup = aijperm->nzgroup; 539 540 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 541 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 542 #else 543 544 for (igroup=0; igroup<ngroup; igroup++) { 545 jstart = xgroup[igroup]; 546 jend = xgroup[igroup+1] - 1; 547 548 nz = nzgroup[igroup]; 549 550 /* Handle the special cases where the number of nonzeros per row 551 * in the group is either 0 or 1. */ 552 if (nz == 0) { 553 for (i=jstart; i<=jend; i++) { 554 iold = iperm[i]; 555 y[iold] = w[iold]; 556 } 557 } 558 else if (nz == 1) { 559 for (i=jstart; i<=jend; i++) { 560 iold = iperm[i]; 561 ipos = ai[iold]; 562 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 563 } 564 } 565 /* For the general case: */ 566 else { 567 568 /* We work our way through the current group in chunks of NDIM rows 569 * at a time. */ 570 571 for (istart=jstart; istart<=jend; istart+=NDIM) { 572 /* Figure out where the chunk of 'isize' rows ends in iperm. 573 * 'isize may of course be less than NDIM for the last chunk. */ 574 iend = istart + (NDIM - 1); 575 if (iend > jend) iend = jend; 576 isize = iend - istart + 1; 577 578 /* Initialize the yp[] array that will be used to hold part of 579 * the permuted results vector, and figure out where in aa each 580 * row of the chunk will begin. */ 581 for (i=0; i<isize; i++) { 582 iold = iperm[istart + i]; 583 /* iold is a row number from the matrix A *before* reordering. */ 584 ip[i] = ai[iold]; 585 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 586 yp[i] = w[iold]; 587 } 588 589 /* If the number of zeros per row exceeds the number of rows in 590 * the chunk, we should vectorize along nz, that is, perform the 591 * mat-vec one row at a time as in the usual CSR case. */ 592 if (nz > isize) { 593 #if defined(PETSC_HAVE_CRAY_VECTOR) 594 #pragma _CRI preferstream 595 #endif 596 for (i=0; i<isize; i++) { 597 #if defined(PETSC_HAVE_CRAY_VECTOR) 598 #pragma _CRI prefervector 599 #endif 600 for (j=0; j<nz; j++) { 601 ipos = ip[i] + j; 602 yp[i] += aa[ipos] * x[aj[ipos]]; 603 } 604 } 605 } 606 /* Otherwise, there are enough rows in the chunk to make it 607 * worthwhile to vectorize across the rows, that is, to do the 608 * matvec by operating with "columns" of the chunk. */ 609 else { 610 for (j=0; j<nz; j++) { 611 for (i=0; i<isize; i++) { 612 ipos = ip[i] + j; 613 yp[i] += aa[ipos] * x[aj[ipos]]; 614 } 615 } 616 } 617 618 #if defined(PETSC_HAVE_CRAY_VECTOR) 619 #pragma _CRI ivdep 620 #endif 621 /* Put results from yp[] into non-permuted result vector y. */ 622 for (i=0; i<isize; i++) { 623 y[iperm[istart+i]] = yp[i]; 624 } 625 } /* End processing chunk of isize rows of a group. */ 626 627 } /* End handling matvec for chunk with nz > 1. */ 628 } /* End loop over igroup. */ 629 630 #endif 631 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 632 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 633 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 634 PetscFunctionReturn(0); 635 } 636 637 638 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 639 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 640 * routine, but can also be used to convert an assembled SeqAIJ matrix 641 * into a SeqAIJPERM one. */ 642 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 643 { 644 PetscErrorCode ierr; 645 Mat B = *newmat; 646 Mat_SeqAIJPERM *aijperm; 647 PetscBool sametype; 648 649 PetscFunctionBegin; 650 if (reuse == MAT_INITIAL_MATRIX) { 651 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 652 } 653 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 654 if (sametype) PetscFunctionReturn(0); 655 656 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 657 B->spptr = (void*) aijperm; 658 659 /* Set function pointers for methods that we inherit from AIJ but override. */ 660 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 661 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 662 B->ops->destroy = MatDestroy_SeqAIJPERM; 663 B->ops->mult = MatMult_SeqAIJPERM; 664 B->ops->multadd = MatMultAdd_SeqAIJPERM; 665 666 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 667 /* If A has already been assembled, compute the permutation. */ 668 if (A->assembled) { 669 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 670 } 671 672 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 673 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 674 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 675 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 676 677 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 678 *newmat = B; 679 PetscFunctionReturn(0); 680 } 681 682 /*@C 683 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 684 This type inherits from AIJ, but calculates some additional permutation 685 information that is used to allow better vectorization of some 686 operations. At the cost of increased storage, the AIJ formatted 687 matrix can be copied to a format in which pieces of the matrix are 688 stored in ELLPACK format, allowing the vectorized matrix multiply 689 routine to use stride-1 memory accesses. As with the AIJ type, it is 690 important to preallocate matrix storage in order to get good assembly 691 performance. 692 693 Collective on MPI_Comm 694 695 Input Parameters: 696 + comm - MPI communicator, set to PETSC_COMM_SELF 697 . m - number of rows 698 . n - number of columns 699 . nz - number of nonzeros per row (same for all rows) 700 - nnz - array containing the number of nonzeros in the various rows 701 (possibly different for each row) or NULL 702 703 Output Parameter: 704 . A - the matrix 705 706 Notes: 707 If nnz is given then nz is ignored 708 709 Level: intermediate 710 711 .keywords: matrix, cray, sparse, parallel 712 713 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 714 @*/ 715 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 716 { 717 PetscErrorCode ierr; 718 719 PetscFunctionBegin; 720 ierr = MatCreate(comm,A);CHKERRQ(ierr); 721 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 722 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 723 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 728 { 729 PetscErrorCode ierr; 730 731 PetscFunctionBegin; 732 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 733 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 734 PetscFunctionReturn(0); 735 } 736 737