/*
  Defines basic operations for the MATSEQAIJPERM matrix class.
  This class is derived from the MATSEQAIJ class and retains the
  compressed row storage (aka Yale sparse matrix format) but augments
  it with some permutation information that enables some operations
  to be more vectorizable. A physically rearranged copy of the matrix
  may be stored if the user desires.

  Eventually a variety of permutations may be supported.
*/

#include <../src/mat/impls/aij/seq/aij.h>

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
#include <immintrin.h>

/* Scale factors for the AVX-512 gather intrinsics: 8 for 8-byte elements
 * (PetscScalar == double here) and 4 for 4-byte elements (32-bit PetscInt).
 * Some compilers' immintrin.h do not define these, hence the guards. */
#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8
#endif
#if !defined(_MM_SCALE_4)
#define _MM_SCALE_4 4
#endif
#endif

#define NDIM 512
/* NDIM specifies how many rows at a time we should work with when
 * performing the vectorized mat-vec. This depends on various factors
 * such as vector register length, etc., and I really need to add a
 * way for the user (or the library) to tune this. I'm setting it to
 * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 * routines. */

typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt ngroup; /* number of groups of rows sharing a common nonzero count */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins. xgroup[] has ngroup+1 entries so that
   * xgroup[ngroup] marks the end of the last group. */

  PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector: iperm[k] is the original row index
                      * of the kth row in sorted (grouped-by-nonzero-count) order. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet. None of the fields below are touched
   * by the routines in this file. */
  PetscInt blocksize;
  PetscInt nstep;
  PetscInt *jstart_list;
  PetscInt *jend_list;
  PetscInt *action_list;
  PetscInt *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;

/* Convert a MATSEQAIJPERM matrix back to its base MATSEQAIJ type:
 * restore the SeqAIJ method table, discard the permutation data, and
 * change the type name. With MAT_INITIAL_MATRIX a duplicate of A is
 * converted and returned in *newmat; otherwise A/B are converted in place. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
  /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
  /* so we will ignore 'MatType type'. */
  Mat B = *newmat;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
    /* The duplicate is itself a SeqAIJPERM, so free ITS perm data below,
     * not A's. */
    aijperm = (Mat_SeqAIJPERM *)B->spptr;
  }

  /* Reset the original function pointers. */
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy = MatDestroy_SeqAIJ;
  B->ops->duplicate = MatDuplicate_SeqAIJ;
  B->ops->mult = MatMult_SeqAIJ;
  B->ops->multadd = MatMultAdd_SeqAIJ;

  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));

  /* Free everything in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));
  PetscCall(PetscFree(B->spptr));

  /* Change the type of B to MATSEQAIJ. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));

  *newmat = B;
  PetscFunctionReturn(0);
}

/* Destroy a MATSEQAIJPERM matrix: release the permutation data, then
 * fall through to the SeqAIJ destructor for everything else. */
PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) {
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  PetscFunctionBegin;
  if (aijperm) {
    /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spptr. */
    PetscCall(PetscFree(aijperm->xgroup));
    PetscCall(PetscFree(aijperm->nzgroup));
    PetscCall(PetscFree(aijperm->iperm));
    PetscCall(PetscFree(A->spptr));
  }
  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
  /* Note that I don't call MatSetType(). I believe this is because that
   * is only to be called when *building* a matrix. I could be wrong, but
   * that is how things work for the SuperLU matrix class. */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

/* Duplicate a MATSEQAIJPERM matrix, including a deep copy of the
 * permutation/grouping arrays. */
PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) {
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  Mat_SeqAIJPERM *aijperm_dest;
  PetscBool perm;

  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, op, M));
  PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
  if (perm) {
    /* The duplicate already carries perm data; drop its arrays so the
     * struct copy + fresh allocations below don't leak them. */
    aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
    PetscCall(PetscFree(aijperm_dest->xgroup));
    PetscCall(PetscFree(aijperm_dest->nzgroup));
    PetscCall(PetscFree(aijperm_dest->iperm));
  } else {
    PetscCall(PetscNewLog(*M, &aijperm_dest));
    (*M)->spptr = (void *)aijperm_dest;
    PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
    PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
  }
  /* Shallow-copy the whole struct (scalar fields; the stale pointer
   * fields are overwritten by the allocations just below). */
  PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
  /* Allocate space for, and copy the grouping and permutation info.
   * I note that when the groups are initially determined in
   * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
   * necessary. But at this point, we know how large they need to be, and
   * allocate only the necessary amount of memory. So the duplicated matrix
   * may actually use slightly less storage than the original! */
  PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
  PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
  PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
  PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
  PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
  PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
  PetscFunctionReturn(0);
}

/* Build (or rebuild) the row permutation and grouping information for A.
 * Rows are bucket-sorted by their nonzero count; iperm lists the original
 * row indices in sorted order, and xgroup/nzgroup describe the groups.
 * This is a counting sort, O(m + maxnz) time and memory.
 * A no-op if the nonzero structure has not changed since the last call. */
PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt m;   /* Number of rows in the matrix. */
  PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
  PetscInt maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt nz;
  PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed.*/
  PetscInt i, ngroup, istart, ipos;

  PetscFunctionBegin;
  if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));

  m = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscCall(PetscMalloc1(m, &aijperm->iperm));

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscCall(PetscMalloc1(m, &nz_in_row));

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i = 0; i < m; i++) {
    nz_in_row[i] = ia[i + 1] - ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  /* NOTE(review): sized PetscMax(maxnz, m) + 1 although only maxnz + 1
   * entries are used below — presumably defensive; confirm before shrinking. */
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));

  for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info. There will be at most (maxnz + 1)
   * groups. (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.) If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
  PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));

  /* Second pass. Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i = 0; i <= maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup] = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup = ngroup;

  /* Now fill in the permutation vector iperm: prefix-sum the bucket
   * counts to get each bucket's start offset, then place each row. */
  ipnz[0] = 0;
  for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    ipos = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscCall(PetscFree(rows_in_bucket));
  PetscCall(PetscFree(ipnz));
  PetscCall(PetscFree(nz_in_row));
  PetscFunctionReturn(0);
}

/* Assembly-end hook: run the standard SeqAIJ assembly (with inode use
 * disabled) and then (re)compute the permutation/grouping information. */
PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);

  /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
   * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
   * I'm not sure if this is the best way to do this, but it avoids
   * a lot of code duplication.
   * I also note that currently MATSEQAIJPERM doesn't know anything about
   * the Mat_CompressedRow data structure that SeqAIJ now uses when there
   * are many zero rows. If the SeqAIJ assembly end routine decides to use
   * this, this may break things. (Don't know... haven't looked at it.) */
  a->inode.use = PETSC_FALSE;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  /* Now calculate the permutation and grouping information. */
  PetscCall(MatSeqAIJPERM_create_perm(A));
  PetscFunctionReturn(0);
}

/* Computes yy = A*xx using the row permutation: rows are processed group
 * by group (all rows in a group have the same nonzero count nz), in chunks
 * of up to NDIM rows. For nz > chunk size we vectorize along each row
 * (optionally with AVX-512 gathers); otherwise we vectorize across the
 * rows of the chunk, ELLPACK-style. */
PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar *y;
  const MatScalar *aa;
  const PetscInt *aj, *ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt i, j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d vec_x, vec_y, vec_vals;
  __m256i vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt *iperm; /* Points to the permutation vector. */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt ngroup;
  PetscInt igroup;
  PetscInt jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt iold, nz;
  PetscInt istart, iend, isize;
  PetscInt ipos;
  PetscScalar yp[NDIM];
  PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm = aijperm->iperm;
  ngroup = aijperm->ngroup;
  xgroup = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  /* Dead branch: guarded by defined(notworking), which is never defined. */
  fortranmultaijperm_(&m, x, ii, aj, aa, y);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend = xgroup[igroup + 1] - 1;
    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold = iperm[i];
        ipos = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }

        /* If the number of nonzeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* 8-wide dot product of row i with x: gather x through the
             * 32-bit column indices, fused multiply-add into vec_y. */
            vec_y = _mm512_setzero_pd();
            ipos = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            /* Remainder (nz mod 8): masked gather/fma for 3..7 leftovers;
             * 1 or 2 leftovers are done in scalar — presumably cheaper than
             * a masked gather for so few lanes (TODO confirm).
             * NOTE(review): the masked gather passes vec_x as the
             * pass-through source, which is uninitialized when nz < 8;
             * those lanes are masked out of the fmadd, so the result is
             * unaffected, but it may warn under sanitizers — verify. */
            if ((nz & 0x07) > 2) {
              mask = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Process 8 rows of the chunk at a time: for each, pick up the
             * jth nonzero (ip[i] + j), gather its column index and value,
             * gather x, and accumulate into yp[]. */
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            /* Scalar cleanup for the trailing isize mod 8 rows. */
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */
    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */
#endif
  PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

/* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 * Note that the names I used to designate the vectors differs from that
 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
/*
    I hate having virtually identical code for the mult and the multadd!!!
*/
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar *y, *w;
  const MatScalar *aa;
  const PetscInt *aj, *ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i, j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt *iperm; /* Points to the permutation vector. */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt ngroup;
  PetscInt igroup;
  PetscInt jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt iold, nz;
  PetscInt istart, iend, isize;
  PetscInt ipos;
  PetscScalar yp[NDIM];
  PetscInt ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, ww, &y, &w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm = aijperm->iperm;
  ngroup = aijperm->ngroup;
  xgroup = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      /* Empty rows: y is just a copy of w. */
      for (i = jstart; i <= jend; i++) {
        iold = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold = iperm[i];
        ipos = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. Unlike MatMult, yp[] starts from
         * w[] so the additive term comes along for free. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of nonzeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */

#endif
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
  PetscFunctionReturn(0);
}

/* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
 * routine, but can also be used to convert an assembled SeqAIJ matrix
 * into a SeqAIJPERM one.
*/ 607 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) { 608 Mat B = *newmat; 609 Mat_SeqAIJPERM *aijperm; 610 PetscBool sametype; 611 612 PetscFunctionBegin; 613 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); 614 PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype)); 615 if (sametype) PetscFunctionReturn(0); 616 617 PetscCall(PetscNewLog(B, &aijperm)); 618 B->spptr = (void *)aijperm; 619 620 /* Set function pointers for methods that we inherit from AIJ but override. */ 621 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 622 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 623 B->ops->destroy = MatDestroy_SeqAIJPERM; 624 B->ops->mult = MatMult_SeqAIJPERM; 625 B->ops->multadd = MatMultAdd_SeqAIJPERM; 626 627 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 628 /* If A has already been assembled, compute the permutation. */ 629 if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B)); 630 631 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 632 633 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM)); 634 *newmat = B; 635 PetscFunctionReturn(0); 636 } 637 638 /*@C 639 MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`. 640 This type inherits from `MATSEQAIJ`, but calculates some additional permutation 641 information that is used to allow better vectorization of some 642 operations. At the cost of increased storage, the `MATSEQAIJ` formatted 643 matrix can be copied to a format in which pieces of the matrix are 644 stored in ELLPACK format, allowing the vectorized matrix multiply 645 routine to use stride-1 memory accesses. As with the `MATSEQAIJ` type, it is 646 important to preallocate matrix storage in order to get good assembly 647 performance. 
648 649 Collective 650 651 Input Parameters: 652 + comm - MPI communicator, set to `PETSC_COMM_SELF` 653 . m - number of rows 654 . n - number of columns 655 . nz - number of nonzeros per row (same for all rows) 656 - nnz - array containing the number of nonzeros in the various rows 657 (possibly different for each row) or NULL 658 659 Output Parameter: 660 . A - the matrix 661 662 Note: 663 If nnz is given then nz is ignored 664 665 Level: intermediate 666 667 .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()` 668 @*/ 669 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) { 670 PetscFunctionBegin; 671 PetscCall(MatCreate(comm, A)); 672 PetscCall(MatSetSizes(*A, m, n, m, n)); 673 PetscCall(MatSetType(*A, MATSEQAIJPERM)); 674 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz)); 675 PetscFunctionReturn(0); 676 } 677 678 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) { 679 PetscFunctionBegin; 680 PetscCall(MatSetType(A, MATSEQAIJ)); 681 PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A)); 682 PetscFunctionReturn(0); 683 } 684