1 /* 2 Defines basic operations for the MATSEQAIJPERM matrix class. 3 This class is derived from the MATSEQAIJ class and retains the 4 compressed row storage (aka Yale sparse matrix format) but augments 5 it with some permutation information that enables some operations 6 to be more vectorizable. A physically rearranged copy of the matrix 7 may be stored if the user desires. 8 9 Eventually a variety of permutations may be supported. 10 */ 11 12 #include <../src/mat/impls/aij/seq/aij.h> 13 14 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 15 #include <immintrin.h> 16 17 #if !defined(_MM_SCALE_8) 18 #define _MM_SCALE_8 8 19 #endif 20 #if !defined(_MM_SCALE_4) 21 #define _MM_SCALE_4 4 22 #endif 23 #endif 24 25 #define NDIM 512 26 /* NDIM specifies how many rows at a time we should work with when 27 * performing the vectorized mat-vec. This depends on various factors 28 * such as vector register length, etc., and I really need to add a 29 * way for the user (or the library) to tune this. I'm setting it to 30 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 31 * routines. */ 32 33 typedef struct { 34 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 35 36 PetscInt ngroup; 37 PetscInt *xgroup; 38 /* Denotes where groups of rows with same number of nonzeros 39 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 40 * where the ith group begins. */ 41 42 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 43 PetscInt *iperm; /* The permutation vector. */ 44 45 /* Some of this stuff is for Ed's recursive triangular solve. 46 * I'm not sure what I need yet. */ 47 PetscInt blocksize; 48 PetscInt nstep; 49 PetscInt *jstart_list; 50 PetscInt *jend_list; 51 PetscInt *action_list; 52 PetscInt *ngroup_list; 53 PetscInt **ipointer_list; 54 PetscInt **xgroup_list; 55 PetscInt **nzgroup_list; 56 PetscInt **iperm_list; 57 } Mat_SeqAIJPERM; 58 59 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) 60 { 61 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 62 /* so we will ignore 'MatType type'. */ 63 Mat B = *newmat; 64 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 65 66 PetscFunctionBegin; 67 if (reuse == MAT_INITIAL_MATRIX) { 68 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); 69 aijperm = (Mat_SeqAIJPERM *)B->spptr; 70 } 71 72 /* Reset the original function pointers. */ 73 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 74 B->ops->destroy = MatDestroy_SeqAIJ; 75 B->ops->duplicate = MatDuplicate_SeqAIJ; 76 B->ops->mult = MatMult_SeqAIJ; 77 B->ops->multadd = MatMultAdd_SeqAIJ; 78 79 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL)); 80 81 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 82 PetscCall(PetscFree(aijperm->xgroup)); 83 PetscCall(PetscFree(aijperm->nzgroup)); 84 PetscCall(PetscFree(aijperm->iperm)); 85 PetscCall(PetscFree(B->spptr)); 86 87 /* Change the type of B to MATSEQAIJ. */ 88 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ)); 89 90 *newmat = B; 91 PetscFunctionReturn(PETSC_SUCCESS); 92 } 93 94 static PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 95 { 96 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 97 98 PetscFunctionBegin; 99 if (aijperm) { 100 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 101 PetscCall(PetscFree(aijperm->xgroup)); 102 PetscCall(PetscFree(aijperm->nzgroup)); 103 PetscCall(PetscFree(aijperm->iperm)); 104 PetscCall(PetscFree(A->spptr)); 105 } 106 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 107 * to destroy everything that remains. */ 108 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ)); 109 /* Note that I don't call MatSetType(). I believe this is because that 110 * is only to be called when *building* a matrix. I could be wrong, but 111 * that is how things work for the SuperLU matrix class. */ 112 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL)); 113 PetscCall(MatDestroy_SeqAIJ(A)); 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 static PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 118 { 119 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 120 Mat_SeqAIJPERM *aijperm_dest; 121 PetscBool perm; 122 123 PetscFunctionBegin; 124 PetscCall(MatDuplicate_SeqAIJ(A, op, M)); 125 PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm)); 126 if (perm) { 127 aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr; 128 PetscCall(PetscFree(aijperm_dest->xgroup)); 129 PetscCall(PetscFree(aijperm_dest->nzgroup)); 130 PetscCall(PetscFree(aijperm_dest->iperm)); 131 } else { 132 PetscCall(PetscNew(&aijperm_dest)); 133 (*M)->spptr = (void *)aijperm_dest; 134 PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM)); 135 PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 136 } 137 PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1)); 138 /* Allocate space for, and copy the grouping and permutation info. 139 * I note that when the groups are initially determined in 140 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 141 * necessary. But at this point, we know how large they need to be, and 142 * allocate only the necessary amount of memory. So the duplicated matrix 143 * may actually use slightly less storage than the original! */ 144 PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm)); 145 PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup)); 146 PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup)); 147 PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n)); 148 PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1)); 149 PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 154 { 155 Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data; 156 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 157 PetscInt m; /* Number of rows in the matrix. */ 158 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 159 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 160 PetscInt *rows_in_bucket; 161 /* To construct the permutation, we sort each row into one of maxnz 162 * buckets based on how many nonzeros are in the row. */ 163 PetscInt nz; 164 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 165 PetscInt *ipnz; 166 /* When constructing the iperm permutation vector, 167 * ipnz[nz] is used to point to the next place in the permutation vector 168 * that a row with nz nonzero elements should be placed.*/ 169 PetscInt i, ngroup, istart, ipos; 170 171 PetscFunctionBegin; 172 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */ 173 aijperm->nonzerostate = A->nonzerostate; 174 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 175 PetscCall(PetscFree(aijperm->xgroup)); 176 PetscCall(PetscFree(aijperm->nzgroup)); 177 PetscCall(PetscFree(aijperm->iperm)); 178 179 m = A->rmap->n; 180 ia = a->i; 181 182 /* Allocate the arrays that will hold the permutation vector. */ 183 PetscCall(PetscMalloc1(m, &aijperm->iperm)); 184 185 /* Allocate some temporary work arrays that will be used in 186 * calculating the permutation vector and groupings. */ 187 PetscCall(PetscMalloc1(m, &nz_in_row)); 188 189 /* Now actually figure out the permutation and grouping. */ 190 191 /* First pass: Determine number of nonzeros in each row, maximum 192 * number of nonzeros in any row, and how many rows fall into each 193 * "bucket" of rows with same number of nonzeros. */ 194 maxnz = 0; 195 for (i = 0; i < m; i++) { 196 nz_in_row[i] = ia[i + 1] - ia[i]; 197 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 198 } 199 PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket)); 200 PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz)); 201 202 for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0; 203 for (i = 0; i < m; i++) { 204 nz = nz_in_row[i]; 205 rows_in_bucket[nz]++; 206 } 207 208 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 209 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 210 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 211 * then xgroup[] must consist of (maxnz + 2) elements, since the last 212 * element of xgroup will tell us where the (maxnz + 1)th group ends. 213 * We allocate space for the maximum number of groups; 214 * that is potentially a little wasteful, but not too much so. 215 * Perhaps I should fix it later. */ 216 PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup)); 217 PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup)); 218 219 /* Second pass. Look at what is in the buckets and create the groupings. 220 * Note that it is OK to have a group of rows with no non-zero values. */ 221 ngroup = 0; 222 istart = 0; 223 for (i = 0; i <= maxnz; i++) { 224 if (rows_in_bucket[i] > 0) { 225 aijperm->nzgroup[ngroup] = i; 226 aijperm->xgroup[ngroup] = istart; 227 ngroup++; 228 istart += rows_in_bucket[i]; 229 } 230 } 231 232 aijperm->xgroup[ngroup] = istart; 233 aijperm->ngroup = ngroup; 234 235 /* Now fill in the permutation vector iperm. */ 236 ipnz[0] = 0; 237 for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i]; 238 239 for (i = 0; i < m; i++) { 240 nz = nz_in_row[i]; 241 ipos = ipnz[nz]; 242 aijperm->iperm[ipos] = i; 243 ipnz[nz]++; 244 } 245 246 /* Clean up temporary work arrays. */ 247 PetscCall(PetscFree(rows_in_bucket)); 248 PetscCall(PetscFree(ipnz)); 249 PetscCall(PetscFree(nz_in_row)); 250 PetscFunctionReturn(PETSC_SUCCESS); 251 } 252 253 static PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 254 { 255 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 256 257 PetscFunctionBegin; 258 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 259 260 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 261 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 262 * I'm not sure if this is the best way to do this, but it avoids 263 * a lot of code duplication. 264 * I also note that currently MATSEQAIJPERM doesn't know anything about 265 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 266 * are many zero rows. If the SeqAIJ assembly end routine decides to use 267 * this, this may break things. (Don't know... haven't looked at it.) */ 268 a->inode.use = PETSC_FALSE; 269 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 270 271 /* Now calculate the permutation and grouping information. */ 272 PetscCall(MatSeqAIJPERM_create_perm(A)); 273 PetscFunctionReturn(PETSC_SUCCESS); 274 } 275 276 static PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) 277 { 278 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 279 const PetscScalar *x; 280 PetscScalar *y; 281 const MatScalar *aa; 282 const PetscInt *aj, *ai; 283 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 284 PetscInt i, j; 285 #endif 286 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 287 __m512d vec_x, vec_y, vec_vals; 288 __m256i vec_idx, vec_ipos, vec_j; 289 __mmask8 mask; 290 #endif 291 292 /* Variables that don't appear in MatMult_SeqAIJ. */ 293 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 294 PetscInt *iperm; /* Points to the permutation vector. */ 295 PetscInt *xgroup; 296 /* Denotes where groups of rows with same number of nonzeros 297 * begin and end in iperm. */ 298 PetscInt *nzgroup; 299 PetscInt ngroup; 300 PetscInt igroup; 301 PetscInt jstart, jend; 302 /* jstart is used in loops to denote the position in iperm where a 303 * group starts; jend denotes the position where it ends. 304 * (jend + 1 is where the next group starts.) */ 305 PetscInt iold, nz; 306 PetscInt istart, iend, isize; 307 PetscInt ipos; 308 PetscScalar yp[NDIM]; 309 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 310 311 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 312 #pragma disjoint(*x, *y, *aa) 313 #endif 314 315 PetscFunctionBegin; 316 PetscCall(VecGetArrayRead(xx, &x)); 317 PetscCall(VecGetArray(yy, &y)); 318 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 319 aa = a->a; /* Nonzero elements stored row-by-row. */ 320 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 321 322 /* Get the info we need about the permutations and groupings. */ 323 iperm = aijperm->iperm; 324 ngroup = aijperm->ngroup; 325 xgroup = aijperm->xgroup; 326 nzgroup = aijperm->nzgroup; 327 328 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 329 fortranmultaijperm_(&m, x, ii, aj, aa, y); 330 #else 331 332 for (igroup = 0; igroup < ngroup; igroup++) { 333 jstart = xgroup[igroup]; 334 jend = xgroup[igroup + 1] - 1; 335 nz = nzgroup[igroup]; 336 337 /* Handle the special cases where the number of nonzeros per row 338 * in the group is either 0 or 1. */ 339 if (nz == 0) { 340 for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0; 341 } else if (nz == 1) { 342 for (i = jstart; i <= jend; i++) { 343 iold = iperm[i]; 344 ipos = ai[iold]; 345 y[iold] = aa[ipos] * x[aj[ipos]]; 346 } 347 } else { 348 /* We work our way through the current group in chunks of NDIM rows 349 * at a time. */ 350 351 for (istart = jstart; istart <= jend; istart += NDIM) { 352 /* Figure out where the chunk of 'isize' rows ends in iperm. 353 * 'isize may of course be less than NDIM for the last chunk. */ 354 iend = istart + (NDIM - 1); 355 356 if (iend > jend) iend = jend; 357 358 isize = iend - istart + 1; 359 360 /* Initialize the yp[] array that will be used to hold part of 361 * the permuted results vector, and figure out where in aa each 362 * row of the chunk will begin. */ 363 for (i = 0; i < isize; i++) { 364 iold = iperm[istart + i]; 365 /* iold is a row number from the matrix A *before* reordering. */ 366 ip[i] = ai[iold]; 367 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 368 yp[i] = (PetscScalar)0.0; 369 } 370 371 /* If the number of zeros per row exceeds the number of rows in 372 * the chunk, we should vectorize along nz, that is, perform the 373 * mat-vec one row at a time as in the usual CSR case. */ 374 if (nz > isize) { 375 #if defined(PETSC_HAVE_CRAY_VECTOR) 376 #pragma _CRI preferstream 377 #endif 378 for (i = 0; i < isize; i++) { 379 #if defined(PETSC_HAVE_CRAY_VECTOR) 380 #pragma _CRI prefervector 381 #endif 382 383 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 384 vec_y = _mm512_setzero_pd(); 385 ipos = ip[i]; 386 for (j = 0; j < (nz >> 3); j++) { 387 vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]); 388 vec_vals = _mm512_loadu_pd(&aa[ipos]); 389 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); 390 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); 391 ipos += 8; 392 } 393 if ((nz & 0x07) > 2) { 394 mask = (__mmask8)(0xff >> (8 - (nz & 0x07))); 395 vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]); 396 vec_vals = _mm512_loadu_pd(&aa[ipos]); 397 vec_x = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8); 398 vec_y = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask); 399 } else if ((nz & 0x07) == 2) { 400 yp[i] += aa[ipos] * x[aj[ipos]]; 401 yp[i] += aa[ipos + 1] * x[aj[ipos + 1]]; 402 } else if ((nz & 0x07) == 1) { 403 yp[i] += aa[ipos] * x[aj[ipos]]; 404 } 405 yp[i] += _mm512_reduce_add_pd(vec_y); 406 #else 407 for (j = 0; j < nz; j++) { 408 ipos = ip[i] + j; 409 yp[i] += aa[ipos] * x[aj[ipos]]; 410 } 411 #endif 412 } 413 } else { 414 /* Otherwise, there are enough rows in the chunk to make it 415 * worthwhile to vectorize across the rows, that is, to do the 416 * matvec by operating with "columns" of the chunk. */ 417 for (j = 0; j < nz; j++) { 418 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 419 vec_j = _mm256_set1_epi32(j); 420 for (i = 0; i < ((isize >> 3) << 3); i += 8) { 421 vec_y = _mm512_loadu_pd(&yp[i]); 422 vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]); 423 vec_ipos = _mm256_add_epi32(vec_ipos, vec_j); 424 vec_idx = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4); 425 vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8); 426 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); 427 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); 428 _mm512_storeu_pd(&yp[i], vec_y); 429 } 430 for (i = isize - (isize & 0x07); i < isize; i++) { 431 ipos = ip[i] + j; 432 yp[i] += aa[ipos] * x[aj[ipos]]; 433 } 434 #else 435 for (i = 0; i < isize; i++) { 436 ipos = ip[i] + j; 437 yp[i] += aa[ipos] * x[aj[ipos]]; 438 } 439 #endif 440 } 441 } 442 443 #if defined(PETSC_HAVE_CRAY_VECTOR) 444 #pragma _CRI ivdep 445 #endif 446 /* Put results from yp[] into non-permuted result vector y. */ 447 for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i]; 448 } /* End processing chunk of isize rows of a group. */ 449 } /* End handling matvec for chunk with nz > 1. */ 450 } /* End loop over igroup. */ 451 #endif 452 PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0))); 453 PetscCall(VecRestoreArrayRead(xx, &x)); 454 PetscCall(VecRestoreArray(yy, &y)); 455 PetscFunctionReturn(PETSC_SUCCESS); 456 } 457 458 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 459 * Note that the names I used to designate the vectors differs from that 460 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 461 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 462 /* 463 I hate having virtually identical code for the mult and the multadd!!! 464 */ 465 static PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) 466 { 467 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 468 const PetscScalar *x; 469 PetscScalar *y, *w; 470 const MatScalar *aa; 471 const PetscInt *aj, *ai; 472 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 473 PetscInt i, j; 474 #endif 475 476 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 477 Mat_SeqAIJPERM *aijperm; 478 PetscInt *iperm; /* Points to the permutation vector. */ 479 PetscInt *xgroup; 480 /* Denotes where groups of rows with same number of nonzeros 481 * begin and end in iperm. */ 482 PetscInt *nzgroup; 483 PetscInt ngroup; 484 PetscInt igroup; 485 PetscInt jstart, jend; 486 /* jstart is used in loops to denote the position in iperm where a 487 * group starts; jend denotes the position where it ends. 488 * (jend + 1 is where the next group starts.) */ 489 PetscInt iold, nz; 490 PetscInt istart, iend, isize; 491 PetscInt ipos; 492 PetscScalar yp[NDIM]; 493 PetscInt ip[NDIM]; 494 /* yp[] and ip[] are treated as vector "registers" for performing 495 * the mat-vec. */ 496 497 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 498 #pragma disjoint(*x, *y, *aa) 499 #endif 500 501 PetscFunctionBegin; 502 PetscCall(VecGetArrayRead(xx, &x)); 503 PetscCall(VecGetArrayPair(yy, ww, &y, &w)); 504 505 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 506 aa = a->a; /* Nonzero elements stored row-by-row. */ 507 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 508 509 /* Get the info we need about the permutations and groupings. */ 510 aijperm = (Mat_SeqAIJPERM *)A->spptr; 511 iperm = aijperm->iperm; 512 ngroup = aijperm->ngroup; 513 xgroup = aijperm->xgroup; 514 nzgroup = aijperm->nzgroup; 515 516 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 517 fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w); 518 #else 519 520 for (igroup = 0; igroup < ngroup; igroup++) { 521 jstart = xgroup[igroup]; 522 jend = xgroup[igroup + 1] - 1; 523 524 nz = nzgroup[igroup]; 525 526 /* Handle the special cases where the number of nonzeros per row 527 * in the group is either 0 or 1. */ 528 if (nz == 0) { 529 for (i = jstart; i <= jend; i++) { 530 iold = iperm[i]; 531 y[iold] = w[iold]; 532 } 533 } else if (nz == 1) { 534 for (i = jstart; i <= jend; i++) { 535 iold = iperm[i]; 536 ipos = ai[iold]; 537 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 538 } 539 } 540 /* For the general case: */ 541 else { 542 /* We work our way through the current group in chunks of NDIM rows 543 * at a time. */ 544 545 for (istart = jstart; istart <= jend; istart += NDIM) { 546 /* Figure out where the chunk of 'isize' rows ends in iperm. 547 * 'isize may of course be less than NDIM for the last chunk. */ 548 iend = istart + (NDIM - 1); 549 if (iend > jend) iend = jend; 550 isize = iend - istart + 1; 551 552 /* Initialize the yp[] array that will be used to hold part of 553 * the permuted results vector, and figure out where in aa each 554 * row of the chunk will begin. */ 555 for (i = 0; i < isize; i++) { 556 iold = iperm[istart + i]; 557 /* iold is a row number from the matrix A *before* reordering. */ 558 ip[i] = ai[iold]; 559 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 560 yp[i] = w[iold]; 561 } 562 563 /* If the number of zeros per row exceeds the number of rows in 564 * the chunk, we should vectorize along nz, that is, perform the 565 * mat-vec one row at a time as in the usual CSR case. */ 566 if (nz > isize) { 567 #if defined(PETSC_HAVE_CRAY_VECTOR) 568 #pragma _CRI preferstream 569 #endif 570 for (i = 0; i < isize; i++) { 571 #if defined(PETSC_HAVE_CRAY_VECTOR) 572 #pragma _CRI prefervector 573 #endif 574 for (j = 0; j < nz; j++) { 575 ipos = ip[i] + j; 576 yp[i] += aa[ipos] * x[aj[ipos]]; 577 } 578 } 579 } 580 /* Otherwise, there are enough rows in the chunk to make it 581 * worthwhile to vectorize across the rows, that is, to do the 582 * matvec by operating with "columns" of the chunk. */ 583 else { 584 for (j = 0; j < nz; j++) { 585 for (i = 0; i < isize; i++) { 586 ipos = ip[i] + j; 587 yp[i] += aa[ipos] * x[aj[ipos]]; 588 } 589 } 590 } 591 592 #if defined(PETSC_HAVE_CRAY_VECTOR) 593 #pragma _CRI ivdep 594 #endif 595 /* Put results from yp[] into non-permuted result vector y. */ 596 for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i]; 597 } /* End processing chunk of isize rows of a group. */ 598 599 } /* End handling matvec for chunk with nz > 1. */ 600 } /* End loop over igroup. */ 601 602 #endif 603 PetscCall(PetscLogFlops(2.0 * a->nz)); 604 PetscCall(VecRestoreArrayRead(xx, &x)); 605 PetscCall(VecRestoreArrayPair(yy, ww, &y, &w)); 606 PetscFunctionReturn(PETSC_SUCCESS); 607 } 608 609 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 610 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 611 * routine, but can also be used to convert an assembled SeqAIJ matrix 612 * into a SeqAIJPERM one. */ 613 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) 614 { 615 Mat B = *newmat; 616 Mat_SeqAIJPERM *aijperm; 617 PetscBool sametype; 618 619 PetscFunctionBegin; 620 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); 621 PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype)); 622 if (sametype) PetscFunctionReturn(PETSC_SUCCESS); 623 624 PetscCall(PetscNew(&aijperm)); 625 B->spptr = (void *)aijperm; 626 627 /* Set function pointers for methods that we inherit from AIJ but override. */ 628 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 629 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 630 B->ops->destroy = MatDestroy_SeqAIJPERM; 631 B->ops->mult = MatMult_SeqAIJPERM; 632 B->ops->multadd = MatMultAdd_SeqAIJPERM; 633 634 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 635 /* If A has already been assembled, compute the permutation. */ 636 if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B)); 637 638 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 639 640 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM)); 641 *newmat = B; 642 PetscFunctionReturn(PETSC_SUCCESS); 643 } 644 645 /*@C 646 MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`. 647 648 Collective 649 650 Input Parameters: 651 + comm - MPI communicator, set to `PETSC_COMM_SELF` 652 . m - number of rows 653 . n - number of columns 654 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is given 655 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 656 657 Output Parameter: 658 . A - the matrix 659 660 Level: intermediate 661 662 Notes: 663 This type inherits from `MATSEQAIJ`, but calculates some additional permutation information 664 that is used to allow better vectorization of some operations. At the cost of increased 665 storage, the `MATSEQAIJ` formatted matrix can be copied to a format in which pieces of the 666 matrix are stored in ELLPACK format, allowing the vectorized matrix multiply routine to use 667 stride-1 memory accesses. 668 669 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()` 670 @*/ 671 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 672 { 673 PetscFunctionBegin; 674 PetscCall(MatCreate(comm, A)); 675 PetscCall(MatSetSizes(*A, m, n, m, n)); 676 PetscCall(MatSetType(*A, MATSEQAIJPERM)); 677 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz)); 678 PetscFunctionReturn(PETSC_SUCCESS); 679 } 680 681 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 682 { 683 PetscFunctionBegin; 684 PetscCall(MatSetType(A, MATSEQAIJ)); 685 PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A)); 686 PetscFunctionReturn(PETSC_SUCCESS); 687 } 688