1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) { 61 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 62 /* so we will ignore 'MatType type'. */ 63 Mat B = *newmat; 64 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 65 66 PetscFunctionBegin; 67 if (reuse == MAT_INITIAL_MATRIX) { 68 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); 69 aijperm = (Mat_SeqAIJPERM *)B->spptr; 70 } 71 72 /* Reset the original function pointers. */ 73 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 74 B->ops->destroy = MatDestroy_SeqAIJ; 75 B->ops->duplicate = MatDuplicate_SeqAIJ; 76 B->ops->mult = MatMult_SeqAIJ; 77 B->ops->multadd = MatMultAdd_SeqAIJ; 78 79 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL)); 80 81 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 82 PetscCall(PetscFree(aijperm->xgroup)); 83 PetscCall(PetscFree(aijperm->nzgroup)); 84 PetscCall(PetscFree(aijperm->iperm)); 85 PetscCall(PetscFree(B->spptr)); 86 87 /* Change the type of B to MATSEQAIJ. */ 88 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ)); 89 90 *newmat = B; 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) { 95 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 96 97 PetscFunctionBegin; 98 if (aijperm) { 99 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 100 PetscCall(PetscFree(aijperm->xgroup)); 101 PetscCall(PetscFree(aijperm->nzgroup)); 102 PetscCall(PetscFree(aijperm->iperm)); 103 PetscCall(PetscFree(A->spptr)); 104 } 105 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 106 * to destroy everything that remains. */ 107 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ)); 108 /* Note that I don't call MatSetType(). I believe this is because that 109 * is only to be called when *building* a matrix. I could be wrong, but 110 * that is how things work for the SuperLU matrix class. */ 111 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL)); 112 PetscCall(MatDestroy_SeqAIJ(A)); 113 PetscFunctionReturn(0); 114 } 115 116 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) { 117 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 118 Mat_SeqAIJPERM *aijperm_dest; 119 PetscBool perm; 120 121 PetscFunctionBegin; 122 PetscCall(MatDuplicate_SeqAIJ(A, op, M)); 123 PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm)); 124 if (perm) { 125 aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr; 126 PetscCall(PetscFree(aijperm_dest->xgroup)); 127 PetscCall(PetscFree(aijperm_dest->nzgroup)); 128 PetscCall(PetscFree(aijperm_dest->iperm)); 129 } else { 130 PetscCall(PetscNewLog(*M, &aijperm_dest)); 131 (*M)->spptr = (void *)aijperm_dest; 132 PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM)); 133 PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 134 } 135 PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1)); 136 /* Allocate space for, and copy the grouping and permutation info. 137 * I note that when the groups are initially determined in 138 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 139 * necessary. But at this point, we know how large they need to be, and 140 * allocate only the necessary amount of memory. So the duplicated matrix 141 * may actually use slightly less storage than the original! */ 142 PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm)); 143 PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup)); 144 PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup)); 145 PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n)); 146 PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1)); 147 PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup)); 148 PetscFunctionReturn(0); 149 } 150 151 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) { 152 Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data; 153 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 154 PetscInt m; /* Number of rows in the matrix. */ 155 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 156 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 157 PetscInt *rows_in_bucket; 158 /* To construct the permutation, we sort each row into one of maxnz 159 * buckets based on how many nonzeros are in the row. */ 160 PetscInt nz; 161 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 162 PetscInt *ipnz; 163 /* When constructing the iperm permutation vector, 164 * ipnz[nz] is used to point to the next place in the permutation vector 165 * that a row with nz nonzero elements should be placed.*/ 166 PetscInt i, ngroup, istart, ipos; 167 168 PetscFunctionBegin; 169 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 170 aijperm->nonzerostate = A->nonzerostate; 171 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 172 PetscCall(PetscFree(aijperm->xgroup)); 173 PetscCall(PetscFree(aijperm->nzgroup)); 174 PetscCall(PetscFree(aijperm->iperm)); 175 176 m = A->rmap->n; 177 ia = a->i; 178 179 /* Allocate the arrays that will hold the permutation vector. */ 180 PetscCall(PetscMalloc1(m, &aijperm->iperm)); 181 182 /* Allocate some temporary work arrays that will be used in 183 * calculating the permutation vector and groupings. */ 184 PetscCall(PetscMalloc1(m, &nz_in_row)); 185 186 /* Now actually figure out the permutation and grouping. */ 187 188 /* First pass: Determine number of nonzeros in each row, maximum 189 * number of nonzeros in any row, and how many rows fall into each 190 * "bucket" of rows with same number of nonzeros. */ 191 maxnz = 0; 192 for (i = 0; i < m; i++) { 193 nz_in_row[i] = ia[i + 1] - ia[i]; 194 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 195 } 196 PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket)); 197 PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz)); 198 199 for (i = 0; i <= maxnz; i++) { rows_in_bucket[i] = 0; } 200 for (i = 0; i < m; i++) { 201 nz = nz_in_row[i]; 202 rows_in_bucket[nz]++; 203 } 204 205 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 206 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 207 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 208 * then xgroup[] must consist of (maxnz + 2) elements, since the last 209 * element of xgroup will tell us where the (maxnz + 1)th group ends. 210 * We allocate space for the maximum number of groups; 211 * that is potentially a little wasteful, but not too much so. 212 * Perhaps I should fix it later. */ 213 PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup)); 214 PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup)); 215 216 /* Second pass. Look at what is in the buckets and create the groupings. 217 * Note that it is OK to have a group of rows with no non-zero values. */ 218 ngroup = 0; 219 istart = 0; 220 for (i = 0; i <= maxnz; i++) { 221 if (rows_in_bucket[i] > 0) { 222 aijperm->nzgroup[ngroup] = i; 223 aijperm->xgroup[ngroup] = istart; 224 ngroup++; 225 istart += rows_in_bucket[i]; 226 } 227 } 228 229 aijperm->xgroup[ngroup] = istart; 230 aijperm->ngroup = ngroup; 231 232 /* Now fill in the permutation vector iperm. */ 233 ipnz[0] = 0; 234 for (i = 0; i < maxnz; i++) { ipnz[i + 1] = ipnz[i] + rows_in_bucket[i]; } 235 236 for (i = 0; i < m; i++) { 237 nz = nz_in_row[i]; 238 ipos = ipnz[nz]; 239 aijperm->iperm[ipos] = i; 240 ipnz[nz]++; 241 } 242 243 /* Clean up temporary work arrays. */ 244 PetscCall(PetscFree(rows_in_bucket)); 245 PetscCall(PetscFree(ipnz)); 246 PetscCall(PetscFree(nz_in_row)); 247 PetscFunctionReturn(0); 248 } 249 250 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) { 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 253 PetscFunctionBegin; 254 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 255 256 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 257 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 258 * I'm not sure if this is the best way to do this, but it avoids 259 * a lot of code duplication. 260 * I also note that currently MATSEQAIJPERM doesn't know anything about 261 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 262 * are many zero rows. If the SeqAIJ assembly end routine decides to use 263 * this, this may break things. (Don't know... haven't looked at it.) */ 264 a->inode.use = PETSC_FALSE; 265 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 266 267 /* Now calculate the permutation and grouping information. */ 268 PetscCall(MatSeqAIJPERM_create_perm(A)); 269 PetscFunctionReturn(0); 270 } 271 272 PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) { 273 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 274 const PetscScalar *x; 275 PetscScalar *y; 276 const MatScalar *aa; 277 const PetscInt *aj, *ai; 278 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 279 PetscInt i, j; 280 #endif 281 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 282 __m512d vec_x, vec_y, vec_vals; 283 __m256i vec_idx, vec_ipos, vec_j; 284 __mmask8 mask; 285 #endif 286 287 /* Variables that don't appear in MatMult_SeqAIJ. */ 288 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 289 PetscInt *iperm; /* Points to the permutation vector. */ 290 PetscInt *xgroup; 291 /* Denotes where groups of rows with same number of nonzeros 292 * begin and end in iperm. */ 293 PetscInt *nzgroup; 294 PetscInt ngroup; 295 PetscInt igroup; 296 PetscInt jstart, jend; 297 /* jstart is used in loops to denote the position in iperm where a 298 * group starts; jend denotes the position where it ends. 299 * (jend + 1 is where the next group starts.) */ 300 PetscInt iold, nz; 301 PetscInt istart, iend, isize; 302 PetscInt ipos; 303 PetscScalar yp[NDIM]; 304 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 305 306 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 307 #pragma disjoint(*x, *y, *aa) 308 #endif 309 310 PetscFunctionBegin; 311 PetscCall(VecGetArrayRead(xx, &x)); 312 PetscCall(VecGetArray(yy, &y)); 313 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 314 aa = a->a; /* Nonzero elements stored row-by-row. */ 315 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 316 317 /* Get the info we need about the permutations and groupings. */ 318 iperm = aijperm->iperm; 319 ngroup = aijperm->ngroup; 320 xgroup = aijperm->xgroup; 321 nzgroup = aijperm->nzgroup; 322 323 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 324 fortranmultaijperm_(&m, x, ii, aj, aa, y); 325 #else 326 327 for (igroup = 0; igroup < ngroup; igroup++) { 328 jstart = xgroup[igroup]; 329 jend = xgroup[igroup + 1] - 1; 330 nz = nzgroup[igroup]; 331 332 /* Handle the special cases where the number of nonzeros per row 333 * in the group is either 0 or 1. */ 334 if (nz == 0) { 335 for (i = jstart; i <= jend; i++) { y[iperm[i]] = 0.0; } 336 } else if (nz == 1) { 337 for (i = jstart; i <= jend; i++) { 338 iold = iperm[i]; 339 ipos = ai[iold]; 340 y[iold] = aa[ipos] * x[aj[ipos]]; 341 } 342 } else { 343 /* We work our way through the current group in chunks of NDIM rows 344 * at a time. */ 345 346 for (istart = jstart; istart <= jend; istart += NDIM) { 347 /* Figure out where the chunk of 'isize' rows ends in iperm. 348 * 'isize may of course be less than NDIM for the last chunk. */ 349 iend = istart + (NDIM - 1); 350 351 if (iend > jend) iend = jend; 352 353 isize = iend - istart + 1; 354 355 /* Initialize the yp[] array that will be used to hold part of 356 * the permuted results vector, and figure out where in aa each 357 * row of the chunk will begin. */ 358 for (i = 0; i < isize; i++) { 359 iold = iperm[istart + i]; 360 /* iold is a row number from the matrix A *before* reordering. */ 361 ip[i] = ai[iold]; 362 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 363 yp[i] = (PetscScalar)0.0; 364 } 365 366 /* If the number of zeros per row exceeds the number of rows in 367 * the chunk, we should vectorize along nz, that is, perform the 368 * mat-vec one row at a time as in the usual CSR case. */ 369 if (nz > isize) { 370 #if defined(PETSC_HAVE_CRAY_VECTOR) 371 #pragma _CRI preferstream 372 #endif 373 for (i = 0; i < isize; i++) { 374 #if defined(PETSC_HAVE_CRAY_VECTOR) 375 #pragma _CRI prefervector 376 #endif 377 378 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 379 vec_y = _mm512_setzero_pd(); 380 ipos = ip[i]; 381 for (j = 0; j < (nz >> 3); j++) { 382 vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]); 383 vec_vals = _mm512_loadu_pd(&aa[ipos]); 384 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); 385 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); 386 ipos += 8; 387 } 388 if ((nz & 0x07) > 2) { 389 mask = (__mmask8)(0xff >> (8 - (nz & 0x07))); 390 vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]); 391 vec_vals = _mm512_loadu_pd(&aa[ipos]); 392 vec_x = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8); 393 vec_y = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask); 394 } else if ((nz & 0x07) == 2) { 395 yp[i] += aa[ipos] * x[aj[ipos]]; 396 yp[i] += aa[ipos + 1] * x[aj[ipos + 1]]; 397 } else if ((nz & 0x07) == 1) { 398 yp[i] += aa[ipos] * x[aj[ipos]]; 399 } 400 yp[i] += _mm512_reduce_add_pd(vec_y); 401 #else 402 for (j = 0; j < nz; j++) { 403 ipos = ip[i] + j; 404 yp[i] += aa[ipos] * x[aj[ipos]]; 405 } 406 #endif 407 } 408 } else { 409 /* Otherwise, there are enough rows in the chunk to make it 410 * worthwhile to vectorize across the rows, that is, to do the 411 * matvec by operating with "columns" of the chunk. */ 412 for (j = 0; j < nz; j++) { 413 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 414 vec_j = _mm256_set1_epi32(j); 415 for (i = 0; i < ((isize >> 3) << 3); i += 8) { 416 vec_y = _mm512_loadu_pd(&yp[i]); 417 vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]); 418 vec_ipos = _mm256_add_epi32(vec_ipos, vec_j); 419 vec_idx = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4); 420 vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8); 421 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); 422 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); 423 _mm512_storeu_pd(&yp[i], vec_y); 424 } 425 for (i = isize - (isize & 0x07); i < isize; i++) { 426 ipos = ip[i] + j; 427 yp[i] += aa[ipos] * x[aj[ipos]]; 428 } 429 #else 430 for (i = 0; i < isize; i++) { 431 ipos = ip[i] + j; 432 yp[i] += aa[ipos] * x[aj[ipos]]; 433 } 434 #endif 435 } 436 } 437 438 #if defined(PETSC_HAVE_CRAY_VECTOR) 439 #pragma _CRI ivdep 440 #endif 441 /* Put results from yp[] into non-permuted result vector y. */ 442 for (i = 0; i < isize; i++) { y[iperm[istart + i]] = yp[i]; } 443 } /* End processing chunk of isize rows of a group. */ 444 } /* End handling matvec for chunk with nz > 1. */ 445 } /* End loop over igroup. */ 446 #endif 447 PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0))); 448 PetscCall(VecRestoreArrayRead(xx, &x)); 449 PetscCall(VecRestoreArray(yy, &y)); 450 PetscFunctionReturn(0); 451 } 452 453 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 454 * Note that the names I used to designate the vectors differs from that 455 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 456 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 457 /* 458 I hate having virtually identical code for the mult and the multadd!!! 459 */ 460 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) { 461 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 462 const PetscScalar *x; 463 PetscScalar *y, *w; 464 const MatScalar *aa; 465 const PetscInt *aj, *ai; 466 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 467 PetscInt i, j; 468 #endif 469 470 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 471 Mat_SeqAIJPERM *aijperm; 472 PetscInt *iperm; /* Points to the permutation vector. */ 473 PetscInt *xgroup; 474 /* Denotes where groups of rows with same number of nonzeros 475 * begin and end in iperm. */ 476 PetscInt *nzgroup; 477 PetscInt ngroup; 478 PetscInt igroup; 479 PetscInt jstart, jend; 480 /* jstart is used in loops to denote the position in iperm where a 481 * group starts; jend denotes the position where it ends. 482 * (jend + 1 is where the next group starts.) */ 483 PetscInt iold, nz; 484 PetscInt istart, iend, isize; 485 PetscInt ipos; 486 PetscScalar yp[NDIM]; 487 PetscInt ip[NDIM]; 488 /* yp[] and ip[] are treated as vector "registers" for performing 489 * the mat-vec. */ 490 491 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 492 #pragma disjoint(*x, *y, *aa) 493 #endif 494 495 PetscFunctionBegin; 496 PetscCall(VecGetArrayRead(xx, &x)); 497 PetscCall(VecGetArrayPair(yy, ww, &y, &w)); 498 499 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 500 aa = a->a; /* Nonzero elements stored row-by-row. */ 501 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 502 503 /* Get the info we need about the permutations and groupings. */ 504 aijperm = (Mat_SeqAIJPERM *)A->spptr; 505 iperm = aijperm->iperm; 506 ngroup = aijperm->ngroup; 507 xgroup = aijperm->xgroup; 508 nzgroup = aijperm->nzgroup; 509 510 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 511 fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w); 512 #else 513 514 for (igroup = 0; igroup < ngroup; igroup++) { 515 jstart = xgroup[igroup]; 516 jend = xgroup[igroup + 1] - 1; 517 518 nz = nzgroup[igroup]; 519 520 /* Handle the special cases where the number of nonzeros per row 521 * in the group is either 0 or 1. */ 522 if (nz == 0) { 523 for (i = jstart; i <= jend; i++) { 524 iold = iperm[i]; 525 y[iold] = w[iold]; 526 } 527 } else if (nz == 1) { 528 for (i = jstart; i <= jend; i++) { 529 iold = iperm[i]; 530 ipos = ai[iold]; 531 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 532 } 533 } 534 /* For the general case: */ 535 else { 536 /* We work our way through the current group in chunks of NDIM rows 537 * at a time. */ 538 539 for (istart = jstart; istart <= jend; istart += NDIM) { 540 /* Figure out where the chunk of 'isize' rows ends in iperm. 541 * 'isize may of course be less than NDIM for the last chunk. */ 542 iend = istart + (NDIM - 1); 543 if (iend > jend) iend = jend; 544 isize = iend - istart + 1; 545 546 /* Initialize the yp[] array that will be used to hold part of 547 * the permuted results vector, and figure out where in aa each 548 * row of the chunk will begin. */ 549 for (i = 0; i < isize; i++) { 550 iold = iperm[istart + i]; 551 /* iold is a row number from the matrix A *before* reordering. */ 552 ip[i] = ai[iold]; 553 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 554 yp[i] = w[iold]; 555 } 556 557 /* If the number of zeros per row exceeds the number of rows in 558 * the chunk, we should vectorize along nz, that is, perform the 559 * mat-vec one row at a time as in the usual CSR case. */ 560 if (nz > isize) { 561 #if defined(PETSC_HAVE_CRAY_VECTOR) 562 #pragma _CRI preferstream 563 #endif 564 for (i = 0; i < isize; i++) { 565 #if defined(PETSC_HAVE_CRAY_VECTOR) 566 #pragma _CRI prefervector 567 #endif 568 for (j = 0; j < nz; j++) { 569 ipos = ip[i] + j; 570 yp[i] += aa[ipos] * x[aj[ipos]]; 571 } 572 } 573 } 574 /* Otherwise, there are enough rows in the chunk to make it 575 * worthwhile to vectorize across the rows, that is, to do the 576 * matvec by operating with "columns" of the chunk. */ 577 else { 578 for (j = 0; j < nz; j++) { 579 for (i = 0; i < isize; i++) { 580 ipos = ip[i] + j; 581 yp[i] += aa[ipos] * x[aj[ipos]]; 582 } 583 } 584 } 585 586 #if defined(PETSC_HAVE_CRAY_VECTOR) 587 #pragma _CRI ivdep 588 #endif 589 /* Put results from yp[] into non-permuted result vector y. */ 590 for (i = 0; i < isize; i++) { y[iperm[istart + i]] = yp[i]; } 591 } /* End processing chunk of isize rows of a group. */ 592 593 } /* End handling matvec for chunk with nz > 1. */ 594 } /* End loop over igroup. */ 595 596 #endif 597 PetscCall(PetscLogFlops(2.0 * a->nz)); 598 PetscCall(VecRestoreArrayRead(xx, &x)); 599 PetscCall(VecRestoreArrayPair(yy, ww, &y, &w)); 600 PetscFunctionReturn(0); 601 } 602 603 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 604 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 605 * routine, but can also be used to convert an assembled SeqAIJ matrix 606 * into a SeqAIJPERM one. */ 607 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) { 608 Mat B = *newmat; 609 Mat_SeqAIJPERM *aijperm; 610 PetscBool sametype; 611 612 PetscFunctionBegin; 613 if (reuse == MAT_INITIAL_MATRIX) { PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); } 614 PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype)); 615 if (sametype) PetscFunctionReturn(0); 616 617 PetscCall(PetscNewLog(B, &aijperm)); 618 B->spptr = (void *)aijperm; 619 620 /* Set function pointers for methods that we inherit from AIJ but override. */ 621 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 622 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 623 B->ops->destroy = MatDestroy_SeqAIJPERM; 624 B->ops->mult = MatMult_SeqAIJPERM; 625 B->ops->multadd = MatMultAdd_SeqAIJPERM; 626 627 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 628 /* If A has already been assembled, compute the permutation. */ 629 if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B)); 630 631 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 632 633 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM)); 634 *newmat = B; 635 PetscFunctionReturn(0); 636 } 637 638 /*@C 639 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 640 This type inherits from AIJ, but calculates some additional permutation 641 information that is used to allow better vectorization of some 642 operations. At the cost of increased storage, the AIJ formatted 643 matrix can be copied to a format in which pieces of the matrix are 644 stored in ELLPACK format, allowing the vectorized matrix multiply 645 routine to use stride-1 memory accesses. As with the AIJ type, it is 646 important to preallocate matrix storage in order to get good assembly 647 performance. 648 649 Collective 650 651 Input Parameters: 652 + comm - MPI communicator, set to PETSC_COMM_SELF 653 . m - number of rows 654 . n - number of columns 655 . nz - number of nonzeros per row (same for all rows) 656 - nnz - array containing the number of nonzeros in the various rows 657 (possibly different for each row) or NULL 658 659 Output Parameter: 660 . A - the matrix 661 662 Notes: 663 If nnz is given then nz is ignored 664 665 Level: intermediate 666 667 .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()` 668 @*/ 669 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) { 670 PetscFunctionBegin; 671 PetscCall(MatCreate(comm, A)); 672 PetscCall(MatSetSizes(*A, m, n, m, n)); 673 PetscCall(MatSetType(*A, MATSEQAIJPERM)); 674 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz)); 675 PetscFunctionReturn(0); 676 } 677 678 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) { 679 PetscFunctionBegin; 680 PetscCall(MatSetType(A, MATSEQAIJ)); 681 PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A)); 682 PetscFunctionReturn(0); 683 } 684