1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 PetscErrorCode ierr; 65 Mat B = *newmat; 66 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 67 68 PetscFunctionBegin; 69 if (reuse == MAT_INITIAL_MATRIX) { 70 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 71 aijperm=(Mat_SeqAIJPERM*)B->spptr; 72 } 73 74 /* Reset the original function pointers. */ 75 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 76 B->ops->destroy = MatDestroy_SeqAIJ; 77 B->ops->duplicate = MatDuplicate_SeqAIJ; 78 B->ops->mult = MatMult_SeqAIJ; 79 B->ops->multadd = MatMultAdd_SeqAIJ; 80 81 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 82 83 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 84 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 85 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 86 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 87 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 88 89 /* Change the type of B to MATSEQAIJ. */ 90 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 91 92 *newmat = B; 93 PetscFunctionReturn(0); 94 } 95 96 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 97 { 98 PetscErrorCode ierr; 99 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 100 101 PetscFunctionBegin; 102 if (aijperm) { 103 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 104 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 105 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 106 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 107 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 108 } 109 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 110 * to destroy everything that remains. */ 111 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 112 /* Note that I don't call MatSetType(). I believe this is because that 113 * is only to be called when *building* a matrix. I could be wrong, but 114 * that is how things work for the SuperLU matrix class. */ 115 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 119 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 120 { 121 PetscErrorCode ierr; 122 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 123 Mat_SeqAIJPERM *aijperm_dest; 124 PetscBool perm; 125 126 PetscFunctionBegin; 127 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 128 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 129 if (perm) { 130 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 131 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 132 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 133 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 134 } else { 135 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 136 (*M)->spptr = (void*) aijperm_dest; 137 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 138 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 139 } 140 ierr = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr); 141 /* Allocate space for, and copy the grouping and permutation info. 142 * I note that when the groups are initially determined in 143 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 144 * necessary. But at this point, we know how large they need to be, and 145 * allocate only the necessary amount of memory. So the duplicated matrix 146 * may actually use slightly less storage than the original! */ 147 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 148 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 149 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 150 ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr); 151 ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr); 152 ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr); 153 PetscFunctionReturn(0); 154 } 155 156 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 157 { 158 PetscErrorCode ierr; 159 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 160 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 161 PetscInt m; /* Number of rows in the matrix. */ 162 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 163 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 164 PetscInt *rows_in_bucket; 165 /* To construct the permutation, we sort each row into one of maxnz 166 * buckets based on how many nonzeros are in the row. */ 167 PetscInt nz; 168 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 169 PetscInt *ipnz; 170 /* When constructing the iperm permutation vector, 171 * ipnz[nz] is used to point to the next place in the permutation vector 172 * that a row with nz nonzero elements should be placed.*/ 173 PetscInt i, ngroup, istart, ipos; 174 175 PetscFunctionBegin; 176 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 177 aijperm->nonzerostate = A->nonzerostate; 178 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 179 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 180 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 181 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 182 183 m = A->rmap->n; 184 ia = a->i; 185 186 /* Allocate the arrays that will hold the permutation vector. */ 187 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 188 189 /* Allocate some temporary work arrays that will be used in 190 * calculating the permuation vector and groupings. */ 191 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 192 193 /* Now actually figure out the permutation and grouping. */ 194 195 /* First pass: Determine number of nonzeros in each row, maximum 196 * number of nonzeros in any row, and how many rows fall into each 197 * "bucket" of rows with same number of nonzeros. */ 198 maxnz = 0; 199 for (i=0; i<m; i++) { 200 nz_in_row[i] = ia[i+1]-ia[i]; 201 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 202 } 203 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 204 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 205 206 for (i=0; i<=maxnz; i++) { 207 rows_in_bucket[i] = 0; 208 } 209 for (i=0; i<m; i++) { 210 nz = nz_in_row[i]; 211 rows_in_bucket[nz]++; 212 } 213 214 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 215 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 216 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 217 * then xgroup[] must consist of (maxnz + 2) elements, since the last 218 * element of xgroup will tell us where the (maxnz + 1)th group ends. 219 * We allocate space for the maximum number of groups; 220 * that is potentially a little wasteful, but not too much so. 221 * Perhaps I should fix it later. */ 222 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 223 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 224 225 /* Second pass. Look at what is in the buckets and create the groupings. 226 * Note that it is OK to have a group of rows with no non-zero values. */ 227 ngroup = 0; 228 istart = 0; 229 for (i=0; i<=maxnz; i++) { 230 if (rows_in_bucket[i] > 0) { 231 aijperm->nzgroup[ngroup] = i; 232 aijperm->xgroup[ngroup] = istart; 233 ngroup++; 234 istart += rows_in_bucket[i]; 235 } 236 } 237 238 aijperm->xgroup[ngroup] = istart; 239 aijperm->ngroup = ngroup; 240 241 /* Now fill in the permutation vector iperm. */ 242 ipnz[0] = 0; 243 for (i=0; i<maxnz; i++) { 244 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 245 } 246 247 for (i=0; i<m; i++) { 248 nz = nz_in_row[i]; 249 ipos = ipnz[nz]; 250 aijperm->iperm[ipos] = i; 251 ipnz[nz]++; 252 } 253 254 /* Clean up temporary work arrays. */ 255 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 256 ierr = PetscFree(ipnz);CHKERRQ(ierr); 257 ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 258 PetscFunctionReturn(0); 259 } 260 261 262 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 263 { 264 PetscErrorCode ierr; 265 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 266 267 PetscFunctionBegin; 268 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 269 270 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 271 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 272 * I'm not sure if this is the best way to do this, but it avoids 273 * a lot of code duplication. 274 * I also note that currently MATSEQAIJPERM doesn't know anything about 275 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 276 * are many zero rows. If the SeqAIJ assembly end routine decides to use 277 * this, this may break things. (Don't know... haven't looked at it.) */ 278 a->inode.use = PETSC_FALSE; 279 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 280 281 /* Now calculate the permutation and grouping information. */ 282 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 283 PetscFunctionReturn(0); 284 } 285 286 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 287 { 288 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 289 const PetscScalar *x; 290 PetscScalar *y; 291 const MatScalar *aa; 292 PetscErrorCode ierr; 293 const PetscInt *aj,*ai; 294 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 295 PetscInt i,j; 296 #endif 297 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 298 __m512d vec_x,vec_y,vec_vals; 299 __m256i vec_idx,vec_ipos,vec_j; 300 __mmask8 mask; 301 #endif 302 303 /* Variables that don't appear in MatMult_SeqAIJ. */ 304 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 305 PetscInt *iperm; /* Points to the permutation vector. */ 306 PetscInt *xgroup; 307 /* Denotes where groups of rows with same number of nonzeros 308 * begin and end in iperm. */ 309 PetscInt *nzgroup; 310 PetscInt ngroup; 311 PetscInt igroup; 312 PetscInt jstart,jend; 313 /* jstart is used in loops to denote the position in iperm where a 314 * group starts; jend denotes the position where it ends. 315 * (jend + 1 is where the next group starts.) */ 316 PetscInt iold,nz; 317 PetscInt istart,iend,isize; 318 PetscInt ipos; 319 PetscScalar yp[NDIM]; 320 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 321 322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 323 #pragma disjoint(*x,*y,*aa) 324 #endif 325 326 PetscFunctionBegin; 327 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 328 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 329 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 330 aa = a->a; /* Nonzero elements stored row-by-row. */ 331 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 332 333 /* Get the info we need about the permutations and groupings. */ 334 iperm = aijperm->iperm; 335 ngroup = aijperm->ngroup; 336 xgroup = aijperm->xgroup; 337 nzgroup = aijperm->nzgroup; 338 339 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 340 fortranmultaijperm_(&m,x,ii,aj,aa,y); 341 #else 342 343 for (igroup=0; igroup<ngroup; igroup++) { 344 jstart = xgroup[igroup]; 345 jend = xgroup[igroup+1] - 1; 346 nz = nzgroup[igroup]; 347 348 /* Handle the special cases where the number of nonzeros per row 349 * in the group is either 0 or 1. */ 350 if (nz == 0) { 351 for (i=jstart; i<=jend; i++) { 352 y[iperm[i]] = 0.0; 353 } 354 } else if (nz == 1) { 355 for (i=jstart; i<=jend; i++) { 356 iold = iperm[i]; 357 ipos = ai[iold]; 358 y[iold] = aa[ipos] * x[aj[ipos]]; 359 } 360 } else { 361 362 /* We work our way through the current group in chunks of NDIM rows 363 * at a time. */ 364 365 for (istart=jstart; istart<=jend; istart+=NDIM) { 366 /* Figure out where the chunk of 'isize' rows ends in iperm. 367 * 'isize may of course be less than NDIM for the last chunk. */ 368 iend = istart + (NDIM - 1); 369 370 if (iend > jend) iend = jend; 371 372 isize = iend - istart + 1; 373 374 /* Initialize the yp[] array that will be used to hold part of 375 * the permuted results vector, and figure out where in aa each 376 * row of the chunk will begin. */ 377 for (i=0; i<isize; i++) { 378 iold = iperm[istart + i]; 379 /* iold is a row number from the matrix A *before* reordering. */ 380 ip[i] = ai[iold]; 381 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 382 yp[i] = (PetscScalar) 0.0; 383 } 384 385 /* If the number of zeros per row exceeds the number of rows in 386 * the chunk, we should vectorize along nz, that is, perform the 387 * mat-vec one row at a time as in the usual CSR case. */ 388 if (nz > isize) { 389 #if defined(PETSC_HAVE_CRAY_VECTOR) 390 #pragma _CRI preferstream 391 #endif 392 for (i=0; i<isize; i++) { 393 #if defined(PETSC_HAVE_CRAY_VECTOR) 394 #pragma _CRI prefervector 395 #endif 396 397 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 398 vec_y = _mm512_setzero_pd(); 399 ipos = ip[i]; 400 for (j=0; j<(nz>>3); j++) { 401 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 402 vec_vals = _mm512_loadu_pd(&aa[ipos]); 403 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 404 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 405 ipos += 8; 406 } 407 if ((nz&0x07)>2) { 408 mask = (__mmask8)(0xff >> (8-(nz&0x07))); 409 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 410 vec_vals = _mm512_loadu_pd(&aa[ipos]); 411 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 412 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 413 } else if ((nz&0x07)==2) { 414 yp[i] += aa[ipos]*x[aj[ipos]]; 415 yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 416 } else if ((nz&0x07)==1) { 417 yp[i] += aa[ipos]*x[aj[ipos]]; 418 } 419 yp[i] += _mm512_reduce_add_pd(vec_y); 420 #else 421 for (j=0; j<nz; j++) { 422 ipos = ip[i] + j; 423 yp[i] += aa[ipos] * x[aj[ipos]]; 424 } 425 #endif 426 } 427 } else { 428 /* Otherwise, there are enough rows in the chunk to make it 429 * worthwhile to vectorize across the rows, that is, to do the 430 * matvec by operating with "columns" of the chunk. */ 431 for (j=0; j<nz; j++) { 432 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 433 vec_j = _mm256_set1_epi32(j); 434 for (i=0; i<((isize>>3)<<3); i+=8) { 435 vec_y = _mm512_loadu_pd(&yp[i]); 436 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 437 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 438 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 439 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 440 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 441 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 442 _mm512_storeu_pd(&yp[i],vec_y); 443 } 444 for (i=isize-(isize&0x07); i<isize; i++) { 445 ipos = ip[i]+j; 446 yp[i] += aa[ipos]*x[aj[ipos]]; 447 } 448 #else 449 for (i=0; i<isize; i++) { 450 ipos = ip[i] + j; 451 yp[i] += aa[ipos] * x[aj[ipos]]; 452 } 453 #endif 454 } 455 } 456 457 #if defined(PETSC_HAVE_CRAY_VECTOR) 458 #pragma _CRI ivdep 459 #endif 460 /* Put results from yp[] into non-permuted result vector y. */ 461 for (i=0; i<isize; i++) { 462 y[iperm[istart+i]] = yp[i]; 463 } 464 } /* End processing chunk of isize rows of a group. */ 465 } /* End handling matvec for chunk with nz > 1. */ 466 } /* End loop over igroup. */ 467 #endif 468 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 469 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 470 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 471 PetscFunctionReturn(0); 472 } 473 474 475 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 476 * Note that the names I used to designate the vectors differs from that 477 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 478 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 479 /* 480 I hate having virtually identical code for the mult and the multadd!!! 481 */ 482 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 483 { 484 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 485 const PetscScalar *x; 486 PetscScalar *y,*w; 487 const MatScalar *aa; 488 PetscErrorCode ierr; 489 const PetscInt *aj,*ai; 490 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 491 PetscInt i,j; 492 #endif 493 494 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 495 Mat_SeqAIJPERM * aijperm; 496 PetscInt *iperm; /* Points to the permutation vector. */ 497 PetscInt *xgroup; 498 /* Denotes where groups of rows with same number of nonzeros 499 * begin and end in iperm. */ 500 PetscInt *nzgroup; 501 PetscInt ngroup; 502 PetscInt igroup; 503 PetscInt jstart,jend; 504 /* jstart is used in loops to denote the position in iperm where a 505 * group starts; jend denotes the position where it ends. 506 * (jend + 1 is where the next group starts.) */ 507 PetscInt iold,nz; 508 PetscInt istart,iend,isize; 509 PetscInt ipos; 510 PetscScalar yp[NDIM]; 511 PetscInt ip[NDIM]; 512 /* yp[] and ip[] are treated as vector "registers" for performing 513 * the mat-vec. */ 514 515 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 516 #pragma disjoint(*x,*y,*aa) 517 #endif 518 519 PetscFunctionBegin; 520 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 521 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 522 523 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 524 aa = a->a; /* Nonzero elements stored row-by-row. */ 525 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 526 527 /* Get the info we need about the permutations and groupings. */ 528 aijperm = (Mat_SeqAIJPERM*) A->spptr; 529 iperm = aijperm->iperm; 530 ngroup = aijperm->ngroup; 531 xgroup = aijperm->xgroup; 532 nzgroup = aijperm->nzgroup; 533 534 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 535 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 536 #else 537 538 for (igroup=0; igroup<ngroup; igroup++) { 539 jstart = xgroup[igroup]; 540 jend = xgroup[igroup+1] - 1; 541 542 nz = nzgroup[igroup]; 543 544 /* Handle the special cases where the number of nonzeros per row 545 * in the group is either 0 or 1. */ 546 if (nz == 0) { 547 for (i=jstart; i<=jend; i++) { 548 iold = iperm[i]; 549 y[iold] = w[iold]; 550 } 551 } 552 else if (nz == 1) { 553 for (i=jstart; i<=jend; i++) { 554 iold = iperm[i]; 555 ipos = ai[iold]; 556 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 557 } 558 } 559 /* For the general case: */ 560 else { 561 562 /* We work our way through the current group in chunks of NDIM rows 563 * at a time. */ 564 565 for (istart=jstart; istart<=jend; istart+=NDIM) { 566 /* Figure out where the chunk of 'isize' rows ends in iperm. 567 * 'isize may of course be less than NDIM for the last chunk. */ 568 iend = istart + (NDIM - 1); 569 if (iend > jend) iend = jend; 570 isize = iend - istart + 1; 571 572 /* Initialize the yp[] array that will be used to hold part of 573 * the permuted results vector, and figure out where in aa each 574 * row of the chunk will begin. */ 575 for (i=0; i<isize; i++) { 576 iold = iperm[istart + i]; 577 /* iold is a row number from the matrix A *before* reordering. */ 578 ip[i] = ai[iold]; 579 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 580 yp[i] = w[iold]; 581 } 582 583 /* If the number of zeros per row exceeds the number of rows in 584 * the chunk, we should vectorize along nz, that is, perform the 585 * mat-vec one row at a time as in the usual CSR case. */ 586 if (nz > isize) { 587 #if defined(PETSC_HAVE_CRAY_VECTOR) 588 #pragma _CRI preferstream 589 #endif 590 for (i=0; i<isize; i++) { 591 #if defined(PETSC_HAVE_CRAY_VECTOR) 592 #pragma _CRI prefervector 593 #endif 594 for (j=0; j<nz; j++) { 595 ipos = ip[i] + j; 596 yp[i] += aa[ipos] * x[aj[ipos]]; 597 } 598 } 599 } 600 /* Otherwise, there are enough rows in the chunk to make it 601 * worthwhile to vectorize across the rows, that is, to do the 602 * matvec by operating with "columns" of the chunk. */ 603 else { 604 for (j=0; j<nz; j++) { 605 for (i=0; i<isize; i++) { 606 ipos = ip[i] + j; 607 yp[i] += aa[ipos] * x[aj[ipos]]; 608 } 609 } 610 } 611 612 #if defined(PETSC_HAVE_CRAY_VECTOR) 613 #pragma _CRI ivdep 614 #endif 615 /* Put results from yp[] into non-permuted result vector y. */ 616 for (i=0; i<isize; i++) { 617 y[iperm[istart+i]] = yp[i]; 618 } 619 } /* End processing chunk of isize rows of a group. */ 620 621 } /* End handling matvec for chunk with nz > 1. */ 622 } /* End loop over igroup. */ 623 624 #endif 625 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 626 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 627 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 628 PetscFunctionReturn(0); 629 } 630 631 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 632 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 633 * routine, but can also be used to convert an assembled SeqAIJ matrix 634 * into a SeqAIJPERM one. */ 635 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 636 { 637 PetscErrorCode ierr; 638 Mat B = *newmat; 639 Mat_SeqAIJPERM *aijperm; 640 PetscBool sametype; 641 642 PetscFunctionBegin; 643 if (reuse == MAT_INITIAL_MATRIX) { 644 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 645 } 646 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 647 if (sametype) PetscFunctionReturn(0); 648 649 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 650 B->spptr = (void*) aijperm; 651 652 /* Set function pointers for methods that we inherit from AIJ but override. */ 653 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 654 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 655 B->ops->destroy = MatDestroy_SeqAIJPERM; 656 B->ops->mult = MatMult_SeqAIJPERM; 657 B->ops->multadd = MatMultAdd_SeqAIJPERM; 658 659 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 660 /* If A has already been assembled, compute the permutation. */ 661 if (A->assembled) { 662 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 663 } 664 665 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 666 667 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 668 *newmat = B; 669 PetscFunctionReturn(0); 670 } 671 672 /*@C 673 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 674 This type inherits from AIJ, but calculates some additional permutation 675 information that is used to allow better vectorization of some 676 operations. At the cost of increased storage, the AIJ formatted 677 matrix can be copied to a format in which pieces of the matrix are 678 stored in ELLPACK format, allowing the vectorized matrix multiply 679 routine to use stride-1 memory accesses. As with the AIJ type, it is 680 important to preallocate matrix storage in order to get good assembly 681 performance. 682 683 Collective 684 685 Input Parameters: 686 + comm - MPI communicator, set to PETSC_COMM_SELF 687 . m - number of rows 688 . n - number of columns 689 . nz - number of nonzeros per row (same for all rows) 690 - nnz - array containing the number of nonzeros in the various rows 691 (possibly different for each row) or NULL 692 693 Output Parameter: 694 . A - the matrix 695 696 Notes: 697 If nnz is given then nz is ignored 698 699 Level: intermediate 700 701 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 702 @*/ 703 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 704 { 705 PetscErrorCode ierr; 706 707 PetscFunctionBegin; 708 ierr = MatCreate(comm,A);CHKERRQ(ierr); 709 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 710 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 711 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 716 { 717 PetscErrorCode ierr; 718 719 PetscFunctionBegin; 720 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 721 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 722 PetscFunctionReturn(0); 723 } 724 725