1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 PetscErrorCode ierr; 65 Mat B = *newmat; 66 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 67 68 PetscFunctionBegin; 69 if (reuse == MAT_INITIAL_MATRIX) { 70 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 71 aijperm=(Mat_SeqAIJPERM*)B->spptr; 72 } 73 74 /* Reset the original function pointers. */ 75 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 76 B->ops->destroy = MatDestroy_SeqAIJ; 77 B->ops->duplicate = MatDuplicate_SeqAIJ; 78 B->ops->mult = MatMult_SeqAIJ; 79 B->ops->multadd = MatMultAdd_SeqAIJ; 80 81 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 82 83 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 84 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 85 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 86 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 87 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 88 89 /* Change the type of B to MATSEQAIJ. */ 90 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 91 92 *newmat = B; 93 PetscFunctionReturn(0); 94 } 95 96 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 97 { 98 PetscErrorCode ierr; 99 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 100 101 PetscFunctionBegin; 102 if (aijperm) { 103 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 104 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 105 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 106 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 107 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 108 } 109 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 110 * to destroy everything that remains. */ 111 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 112 /* Note that I don't call MatSetType(). I believe this is because that 113 * is only to be called when *building* a matrix. I could be wrong, but 114 * that is how things work for the SuperLU matrix class. */ 115 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 119 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 120 { 121 PetscErrorCode ierr; 122 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 123 Mat_SeqAIJPERM *aijperm_dest; 124 PetscBool perm; 125 126 PetscFunctionBegin; 127 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 128 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 129 if (perm) { 130 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 131 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 132 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 133 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 134 } else { 135 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 136 (*M)->spptr = (void*) aijperm_dest; 137 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 138 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 139 } 140 ierr = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr); 141 /* Allocate space for, and copy the grouping and permutation info. 142 * I note that when the groups are initially determined in 143 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 144 * necessary. But at this point, we know how large they need to be, and 145 * allocate only the necessary amount of memory. So the duplicated matrix 146 * may actually use slightly less storage than the original! */ 147 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 148 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 149 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 150 ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr); 151 ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr); 152 ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr); 153 PetscFunctionReturn(0); 154 } 155 156 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 157 { 158 PetscErrorCode ierr; 159 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 160 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 161 PetscInt m; /* Number of rows in the matrix. */ 162 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 163 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 164 PetscInt *rows_in_bucket; 165 /* To construct the permutation, we sort each row into one of maxnz 166 * buckets based on how many nonzeros are in the row. */ 167 PetscInt nz; 168 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 169 PetscInt *ipnz; 170 /* When constructing the iperm permutation vector, 171 * ipnz[nz] is used to point to the next place in the permutation vector 172 * that a row with nz nonzero elements should be placed.*/ 173 PetscInt i, ngroup, istart, ipos; 174 175 PetscFunctionBegin; 176 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 177 aijperm->nonzerostate = A->nonzerostate; 178 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 179 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 180 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 181 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 182 183 m = A->rmap->n; 184 ia = a->i; 185 186 /* Allocate the arrays that will hold the permutation vector. */ 187 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 188 189 /* Allocate some temporary work arrays that will be used in 190 * calculating the permuation vector and groupings. */ 191 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 192 193 /* Now actually figure out the permutation and grouping. */ 194 195 /* First pass: Determine number of nonzeros in each row, maximum 196 * number of nonzeros in any row, and how many rows fall into each 197 * "bucket" of rows with same number of nonzeros. */ 198 maxnz = 0; 199 for (i=0; i<m; i++) { 200 nz_in_row[i] = ia[i+1]-ia[i]; 201 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 202 } 203 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 204 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 205 206 for (i=0; i<=maxnz; i++) { 207 rows_in_bucket[i] = 0; 208 } 209 for (i=0; i<m; i++) { 210 nz = nz_in_row[i]; 211 rows_in_bucket[nz]++; 212 } 213 214 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 215 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 216 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 217 * then xgroup[] must consist of (maxnz + 2) elements, since the last 218 * element of xgroup will tell us where the (maxnz + 1)th group ends. 219 * We allocate space for the maximum number of groups; 220 * that is potentially a little wasteful, but not too much so. 221 * Perhaps I should fix it later. */ 222 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 223 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 224 225 /* Second pass. Look at what is in the buckets and create the groupings. 226 * Note that it is OK to have a group of rows with no non-zero values. */ 227 ngroup = 0; 228 istart = 0; 229 for (i=0; i<=maxnz; i++) { 230 if (rows_in_bucket[i] > 0) { 231 aijperm->nzgroup[ngroup] = i; 232 aijperm->xgroup[ngroup] = istart; 233 ngroup++; 234 istart += rows_in_bucket[i]; 235 } 236 } 237 238 aijperm->xgroup[ngroup] = istart; 239 aijperm->ngroup = ngroup; 240 241 /* Now fill in the permutation vector iperm. */ 242 ipnz[0] = 0; 243 for (i=0; i<maxnz; i++) { 244 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 245 } 246 247 for (i=0; i<m; i++) { 248 nz = nz_in_row[i]; 249 ipos = ipnz[nz]; 250 aijperm->iperm[ipos] = i; 251 ipnz[nz]++; 252 } 253 254 /* Clean up temporary work arrays. */ 255 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 256 ierr = PetscFree(ipnz);CHKERRQ(ierr); 257 ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 258 PetscFunctionReturn(0); 259 } 260 261 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 262 { 263 PetscErrorCode ierr; 264 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 265 266 PetscFunctionBegin; 267 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 268 269 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 270 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 271 * I'm not sure if this is the best way to do this, but it avoids 272 * a lot of code duplication. 273 * I also note that currently MATSEQAIJPERM doesn't know anything about 274 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 275 * are many zero rows. If the SeqAIJ assembly end routine decides to use 276 * this, this may break things. (Don't know... haven't looked at it.) */ 277 a->inode.use = PETSC_FALSE; 278 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 279 280 /* Now calculate the permutation and grouping information. */ 281 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 286 { 287 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 288 const PetscScalar *x; 289 PetscScalar *y; 290 const MatScalar *aa; 291 PetscErrorCode ierr; 292 const PetscInt *aj,*ai; 293 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 294 PetscInt i,j; 295 #endif 296 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 297 __m512d vec_x,vec_y,vec_vals; 298 __m256i vec_idx,vec_ipos,vec_j; 299 __mmask8 mask; 300 #endif 301 302 /* Variables that don't appear in MatMult_SeqAIJ. */ 303 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 304 PetscInt *iperm; /* Points to the permutation vector. */ 305 PetscInt *xgroup; 306 /* Denotes where groups of rows with same number of nonzeros 307 * begin and end in iperm. */ 308 PetscInt *nzgroup; 309 PetscInt ngroup; 310 PetscInt igroup; 311 PetscInt jstart,jend; 312 /* jstart is used in loops to denote the position in iperm where a 313 * group starts; jend denotes the position where it ends. 314 * (jend + 1 is where the next group starts.) */ 315 PetscInt iold,nz; 316 PetscInt istart,iend,isize; 317 PetscInt ipos; 318 PetscScalar yp[NDIM]; 319 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 320 321 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 322 #pragma disjoint(*x,*y,*aa) 323 #endif 324 325 PetscFunctionBegin; 326 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 327 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 328 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 329 aa = a->a; /* Nonzero elements stored row-by-row. */ 330 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 331 332 /* Get the info we need about the permutations and groupings. */ 333 iperm = aijperm->iperm; 334 ngroup = aijperm->ngroup; 335 xgroup = aijperm->xgroup; 336 nzgroup = aijperm->nzgroup; 337 338 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 339 fortranmultaijperm_(&m,x,ii,aj,aa,y); 340 #else 341 342 for (igroup=0; igroup<ngroup; igroup++) { 343 jstart = xgroup[igroup]; 344 jend = xgroup[igroup+1] - 1; 345 nz = nzgroup[igroup]; 346 347 /* Handle the special cases where the number of nonzeros per row 348 * in the group is either 0 or 1. */ 349 if (nz == 0) { 350 for (i=jstart; i<=jend; i++) { 351 y[iperm[i]] = 0.0; 352 } 353 } else if (nz == 1) { 354 for (i=jstart; i<=jend; i++) { 355 iold = iperm[i]; 356 ipos = ai[iold]; 357 y[iold] = aa[ipos] * x[aj[ipos]]; 358 } 359 } else { 360 361 /* We work our way through the current group in chunks of NDIM rows 362 * at a time. */ 363 364 for (istart=jstart; istart<=jend; istart+=NDIM) { 365 /* Figure out where the chunk of 'isize' rows ends in iperm. 366 * 'isize may of course be less than NDIM for the last chunk. */ 367 iend = istart + (NDIM - 1); 368 369 if (iend > jend) iend = jend; 370 371 isize = iend - istart + 1; 372 373 /* Initialize the yp[] array that will be used to hold part of 374 * the permuted results vector, and figure out where in aa each 375 * row of the chunk will begin. */ 376 for (i=0; i<isize; i++) { 377 iold = iperm[istart + i]; 378 /* iold is a row number from the matrix A *before* reordering. */ 379 ip[i] = ai[iold]; 380 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 381 yp[i] = (PetscScalar) 0.0; 382 } 383 384 /* If the number of zeros per row exceeds the number of rows in 385 * the chunk, we should vectorize along nz, that is, perform the 386 * mat-vec one row at a time as in the usual CSR case. */ 387 if (nz > isize) { 388 #if defined(PETSC_HAVE_CRAY_VECTOR) 389 #pragma _CRI preferstream 390 #endif 391 for (i=0; i<isize; i++) { 392 #if defined(PETSC_HAVE_CRAY_VECTOR) 393 #pragma _CRI prefervector 394 #endif 395 396 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 397 vec_y = _mm512_setzero_pd(); 398 ipos = ip[i]; 399 for (j=0; j<(nz>>3); j++) { 400 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 401 vec_vals = _mm512_loadu_pd(&aa[ipos]); 402 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 403 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 404 ipos += 8; 405 } 406 if ((nz&0x07)>2) { 407 mask = (__mmask8)(0xff >> (8-(nz&0x07))); 408 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 409 vec_vals = _mm512_loadu_pd(&aa[ipos]); 410 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 411 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 412 } else if ((nz&0x07)==2) { 413 yp[i] += aa[ipos]*x[aj[ipos]]; 414 yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 415 } else if ((nz&0x07)==1) { 416 yp[i] += aa[ipos]*x[aj[ipos]]; 417 } 418 yp[i] += _mm512_reduce_add_pd(vec_y); 419 #else 420 for (j=0; j<nz; j++) { 421 ipos = ip[i] + j; 422 yp[i] += aa[ipos] * x[aj[ipos]]; 423 } 424 #endif 425 } 426 } else { 427 /* Otherwise, there are enough rows in the chunk to make it 428 * worthwhile to vectorize across the rows, that is, to do the 429 * matvec by operating with "columns" of the chunk. */ 430 for (j=0; j<nz; j++) { 431 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 432 vec_j = _mm256_set1_epi32(j); 433 for (i=0; i<((isize>>3)<<3); i+=8) { 434 vec_y = _mm512_loadu_pd(&yp[i]); 435 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 436 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 437 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 438 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 439 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 440 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 441 _mm512_storeu_pd(&yp[i],vec_y); 442 } 443 for (i=isize-(isize&0x07); i<isize; i++) { 444 ipos = ip[i]+j; 445 yp[i] += aa[ipos]*x[aj[ipos]]; 446 } 447 #else 448 for (i=0; i<isize; i++) { 449 ipos = ip[i] + j; 450 yp[i] += aa[ipos] * x[aj[ipos]]; 451 } 452 #endif 453 } 454 } 455 456 #if defined(PETSC_HAVE_CRAY_VECTOR) 457 #pragma _CRI ivdep 458 #endif 459 /* Put results from yp[] into non-permuted result vector y. */ 460 for (i=0; i<isize; i++) { 461 y[iperm[istart+i]] = yp[i]; 462 } 463 } /* End processing chunk of isize rows of a group. */ 464 } /* End handling matvec for chunk with nz > 1. */ 465 } /* End loop over igroup. */ 466 #endif 467 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 468 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 469 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 470 PetscFunctionReturn(0); 471 } 472 473 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 474 * Note that the names I used to designate the vectors differs from that 475 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 476 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 477 /* 478 I hate having virtually identical code for the mult and the multadd!!! 479 */ 480 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 481 { 482 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 483 const PetscScalar *x; 484 PetscScalar *y,*w; 485 const MatScalar *aa; 486 PetscErrorCode ierr; 487 const PetscInt *aj,*ai; 488 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 489 PetscInt i,j; 490 #endif 491 492 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 493 Mat_SeqAIJPERM * aijperm; 494 PetscInt *iperm; /* Points to the permutation vector. */ 495 PetscInt *xgroup; 496 /* Denotes where groups of rows with same number of nonzeros 497 * begin and end in iperm. */ 498 PetscInt *nzgroup; 499 PetscInt ngroup; 500 PetscInt igroup; 501 PetscInt jstart,jend; 502 /* jstart is used in loops to denote the position in iperm where a 503 * group starts; jend denotes the position where it ends. 504 * (jend + 1 is where the next group starts.) */ 505 PetscInt iold,nz; 506 PetscInt istart,iend,isize; 507 PetscInt ipos; 508 PetscScalar yp[NDIM]; 509 PetscInt ip[NDIM]; 510 /* yp[] and ip[] are treated as vector "registers" for performing 511 * the mat-vec. */ 512 513 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 514 #pragma disjoint(*x,*y,*aa) 515 #endif 516 517 PetscFunctionBegin; 518 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 519 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 520 521 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 522 aa = a->a; /* Nonzero elements stored row-by-row. */ 523 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 524 525 /* Get the info we need about the permutations and groupings. */ 526 aijperm = (Mat_SeqAIJPERM*) A->spptr; 527 iperm = aijperm->iperm; 528 ngroup = aijperm->ngroup; 529 xgroup = aijperm->xgroup; 530 nzgroup = aijperm->nzgroup; 531 532 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 533 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 534 #else 535 536 for (igroup=0; igroup<ngroup; igroup++) { 537 jstart = xgroup[igroup]; 538 jend = xgroup[igroup+1] - 1; 539 540 nz = nzgroup[igroup]; 541 542 /* Handle the special cases where the number of nonzeros per row 543 * in the group is either 0 or 1. */ 544 if (nz == 0) { 545 for (i=jstart; i<=jend; i++) { 546 iold = iperm[i]; 547 y[iold] = w[iold]; 548 } 549 } 550 else if (nz == 1) { 551 for (i=jstart; i<=jend; i++) { 552 iold = iperm[i]; 553 ipos = ai[iold]; 554 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 555 } 556 } 557 /* For the general case: */ 558 else { 559 560 /* We work our way through the current group in chunks of NDIM rows 561 * at a time. */ 562 563 for (istart=jstart; istart<=jend; istart+=NDIM) { 564 /* Figure out where the chunk of 'isize' rows ends in iperm. 565 * 'isize may of course be less than NDIM for the last chunk. */ 566 iend = istart + (NDIM - 1); 567 if (iend > jend) iend = jend; 568 isize = iend - istart + 1; 569 570 /* Initialize the yp[] array that will be used to hold part of 571 * the permuted results vector, and figure out where in aa each 572 * row of the chunk will begin. */ 573 for (i=0; i<isize; i++) { 574 iold = iperm[istart + i]; 575 /* iold is a row number from the matrix A *before* reordering. */ 576 ip[i] = ai[iold]; 577 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 578 yp[i] = w[iold]; 579 } 580 581 /* If the number of zeros per row exceeds the number of rows in 582 * the chunk, we should vectorize along nz, that is, perform the 583 * mat-vec one row at a time as in the usual CSR case. */ 584 if (nz > isize) { 585 #if defined(PETSC_HAVE_CRAY_VECTOR) 586 #pragma _CRI preferstream 587 #endif 588 for (i=0; i<isize; i++) { 589 #if defined(PETSC_HAVE_CRAY_VECTOR) 590 #pragma _CRI prefervector 591 #endif 592 for (j=0; j<nz; j++) { 593 ipos = ip[i] + j; 594 yp[i] += aa[ipos] * x[aj[ipos]]; 595 } 596 } 597 } 598 /* Otherwise, there are enough rows in the chunk to make it 599 * worthwhile to vectorize across the rows, that is, to do the 600 * matvec by operating with "columns" of the chunk. */ 601 else { 602 for (j=0; j<nz; j++) { 603 for (i=0; i<isize; i++) { 604 ipos = ip[i] + j; 605 yp[i] += aa[ipos] * x[aj[ipos]]; 606 } 607 } 608 } 609 610 #if defined(PETSC_HAVE_CRAY_VECTOR) 611 #pragma _CRI ivdep 612 #endif 613 /* Put results from yp[] into non-permuted result vector y. */ 614 for (i=0; i<isize; i++) { 615 y[iperm[istart+i]] = yp[i]; 616 } 617 } /* End processing chunk of isize rows of a group. */ 618 619 } /* End handling matvec for chunk with nz > 1. */ 620 } /* End loop over igroup. */ 621 622 #endif 623 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 624 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 625 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 626 PetscFunctionReturn(0); 627 } 628 629 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 630 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 631 * routine, but can also be used to convert an assembled SeqAIJ matrix 632 * into a SeqAIJPERM one. */ 633 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 634 { 635 PetscErrorCode ierr; 636 Mat B = *newmat; 637 Mat_SeqAIJPERM *aijperm; 638 PetscBool sametype; 639 640 PetscFunctionBegin; 641 if (reuse == MAT_INITIAL_MATRIX) { 642 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 643 } 644 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 645 if (sametype) PetscFunctionReturn(0); 646 647 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 648 B->spptr = (void*) aijperm; 649 650 /* Set function pointers for methods that we inherit from AIJ but override. */ 651 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 652 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 653 B->ops->destroy = MatDestroy_SeqAIJPERM; 654 B->ops->mult = MatMult_SeqAIJPERM; 655 B->ops->multadd = MatMultAdd_SeqAIJPERM; 656 657 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 658 /* If A has already been assembled, compute the permutation. */ 659 if (A->assembled) { 660 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 661 } 662 663 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 664 665 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 666 *newmat = B; 667 PetscFunctionReturn(0); 668 } 669 670 /*@C 671 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 672 This type inherits from AIJ, but calculates some additional permutation 673 information that is used to allow better vectorization of some 674 operations. At the cost of increased storage, the AIJ formatted 675 matrix can be copied to a format in which pieces of the matrix are 676 stored in ELLPACK format, allowing the vectorized matrix multiply 677 routine to use stride-1 memory accesses. As with the AIJ type, it is 678 important to preallocate matrix storage in order to get good assembly 679 performance. 680 681 Collective 682 683 Input Parameters: 684 + comm - MPI communicator, set to PETSC_COMM_SELF 685 . m - number of rows 686 . n - number of columns 687 . nz - number of nonzeros per row (same for all rows) 688 - nnz - array containing the number of nonzeros in the various rows 689 (possibly different for each row) or NULL 690 691 Output Parameter: 692 . A - the matrix 693 694 Notes: 695 If nnz is given then nz is ignored 696 697 Level: intermediate 698 699 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 700 @*/ 701 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 702 { 703 PetscErrorCode ierr; 704 705 PetscFunctionBegin; 706 ierr = MatCreate(comm,A);CHKERRQ(ierr); 707 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 708 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 709 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 710 PetscFunctionReturn(0); 711 } 712 713 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 714 { 715 PetscErrorCode ierr; 716 717 PetscFunctionBegin; 718 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 719 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 720 PetscFunctionReturn(0); 721 } 722 723