1b8a1809bSJed Brown static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2b8a1809bSJed Brown 3b8a1809bSJed Brown /* Note: this file has been modified significantly from its original version */ 4b8a1809bSJed Brown #include <emmintrin.h> 5b8a1809bSJed Brown #include <petsctime.h> 6b8a1809bSJed Brown #include <petscsys.h> 7fa8572e2SBarry Smith #if defined(HAVE_NUMA) 8b8a1809bSJed Brown #include <numa.h> 9fa8572e2SBarry Smith #endif 10b8a1809bSJed Brown #include <float.h> 11b8a1809bSJed Brown 12519f805aSKarl Rupp #if !defined(SSE2) 13b8a1809bSJed Brown #define SSE2 1 14b8a1809bSJed Brown #endif 15519f805aSKarl Rupp #if !defined(__SSE2__) 16b8a1809bSJed Brown #error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 17b8a1809bSJed Brown #endif 18519f805aSKarl Rupp #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 19b8a1809bSJed Brown #define PREFETCH_NTA 1 20b8a1809bSJed Brown #endif 21519f805aSKarl Rupp #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 22b8a1809bSJed Brown #define STATIC_ALLOC 0 23b8a1809bSJed Brown #endif 24519f805aSKarl Rupp #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 25b8a1809bSJed Brown #define FAULT_TOGETHER 0 26b8a1809bSJed Brown #endif 27519f805aSKarl Rupp #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 28b8a1809bSJed Brown #define USE_MEMCPY 0 29b8a1809bSJed Brown #endif 30b8a1809bSJed Brown 31b8a1809bSJed Brown /* 32b8a1809bSJed Brown * Program: Stream 33b8a1809bSJed Brown * Programmer: Joe R. Zagar 34b8a1809bSJed Brown * Revision: 4.0-BETA, October 24, 1995 35b8a1809bSJed Brown * Original code developed by John D. McCalpin 36b8a1809bSJed Brown * 37b8a1809bSJed Brown * This program measures memory transfer rates in MB/s for simple 38b8a1809bSJed Brown * computational kernels coded in C. These numbers reveal the quality 39b8a1809bSJed Brown * of code generation for simple uncacheable kernels as well as showing 40b8a1809bSJed Brown * the cost of floating-point operations relative to memory accesses. 41b8a1809bSJed Brown * 42b8a1809bSJed Brown * INSTRUCTIONS: 43b8a1809bSJed Brown * 44b8a1809bSJed Brown * 1) Stream requires a good bit of memory to run. Adjust the 45b8a1809bSJed Brown * value of 'N' (below) to give a 'timing calibration' of 46b8a1809bSJed Brown * at least 20 clock-ticks. This will provide rate estimates 47b8a1809bSJed Brown * that should be good to about 5% precision. 48b8a1809bSJed Brown */ 49b8a1809bSJed Brown 50b8a1809bSJed Brown #define N 4000000 51b8a1809bSJed Brown #define NTIMES 100 52b8a1809bSJed Brown #define OFFSET 0 53b8a1809bSJed Brown 54b8a1809bSJed Brown #define HLINE "-------------------------------------------------------------\n" 55b8a1809bSJed Brown 56519f805aSKarl Rupp #if !defined(MIN) 57b8a1809bSJed Brown #define MIN(x, y) ((x) < (y) ? (x) : (y)) 58b8a1809bSJed Brown #endif 59519f805aSKarl Rupp #if !defined(MAX) 60b8a1809bSJed Brown #define MAX(x, y) ((x) > (y) ? (x) : (y)) 61b8a1809bSJed Brown #endif 62b8a1809bSJed Brown 63b8a1809bSJed Brown #if STATIC_ALLOC 64b8a1809bSJed Brown double a[N + OFFSET], b[N + OFFSET], c[N + OFFSET]; 65b8a1809bSJed Brown #endif 66b8a1809bSJed Brown 67b8a1809bSJed Brown static int checktick(void); 68b8a1809bSJed Brown static double Second(void); 69b8a1809bSJed Brown 70b8a1809bSJed Brown int main(int argc, char *argv[]) 71b8a1809bSJed Brown { 72b8a1809bSJed Brown const char *label[4] = {"Copy", "Scale", "Add", "Triad"}; 73*67595998SJunchao Zhang const double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N}; 74b8a1809bSJed Brown double rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; 75b8a1809bSJed Brown int quantum; 76b8a1809bSJed Brown int BytesPerWord, j, k, size; 77b8a1809bSJed Brown PetscInt node = -1; 78b8a1809bSJed Brown double scalar, t, times[4][NTIMES]; 79b8a1809bSJed Brown #if !STATIC_ALLOC 80b8a1809bSJed Brown double *PETSC_RESTRICT a, *PETSC_RESTRICT b, *PETSC_RESTRICT c; 81b8a1809bSJed Brown #endif 82b8a1809bSJed Brown 839566063dSJacob Faibussowitsch PetscCall(PetscInitialize(&argc, &argv, 0, help)); 849566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 859566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-node", &node, NULL)); 86b8a1809bSJed Brown /* --- SETUP --- determine precision and check timing --- */ 87b8a1809bSJed Brown 88b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, HLINE); 89b8a1809bSJed Brown BytesPerWord = sizeof(double); 9026f47effSBarry Smith PetscPrintf(PETSC_COMM_WORLD, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); 91b8a1809bSJed Brown 92b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, HLINE); 93b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "Array size = %d, Offset = %d\n", N, OFFSET); 9426f47effSBarry Smith PetscPrintf(PETSC_COMM_WORLD, "Total memory required = %.1f MB per process.\n", (3 * N * BytesPerWord) / 1048576.0); 95b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "Each test is run %d times, but only\n", NTIMES); 96b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "the *best* time for each is used.\n"); 97b8a1809bSJed Brown 98b8a1809bSJed Brown /* Get initial value for system clock. */ 99b8a1809bSJed Brown 100b8a1809bSJed Brown #if !STATIC_ALLOC 101b8a1809bSJed Brown if (node == -1) { 102b8a1809bSJed Brown posix_memalign((void **)&a, 64, N * sizeof(double)); 103b8a1809bSJed Brown posix_memalign((void **)&b, 64, N * sizeof(double)); 104b8a1809bSJed Brown posix_memalign((void **)&c, 64, N * sizeof(double)); 105b8a1809bSJed Brown } else if (node == -2) { 106b8a1809bSJed Brown a = malloc(N * sizeof(double)); 107b8a1809bSJed Brown b = malloc(N * sizeof(double)); 108b8a1809bSJed Brown c = malloc(N * sizeof(double)); 109fa8572e2SBarry Smith #if defined(HAVE_NUMA) 110b8a1809bSJed Brown } else { 111b8a1809bSJed Brown a = numa_alloc_onnode(N * sizeof(double), node); 112b8a1809bSJed Brown b = numa_alloc_onnode(N * sizeof(double), node); 113b8a1809bSJed Brown c = numa_alloc_onnode(N * sizeof(double), node); 114fa8572e2SBarry Smith #endif 115b8a1809bSJed Brown } 116b8a1809bSJed Brown #endif 117b8a1809bSJed Brown #if FAULT_TOGETHER 118b8a1809bSJed Brown for (j = 0; j < N; j++) { 119b8a1809bSJed Brown a[j] = 1.0; 120b8a1809bSJed Brown b[j] = 2.0; 121b8a1809bSJed Brown c[j] = 0.0; 122b8a1809bSJed Brown } 123b8a1809bSJed Brown #else 124b8a1809bSJed Brown for (j = 0; j < N; j++) a[j] = 1.0; 125b8a1809bSJed Brown for (j = 0; j < N; j++) b[j] = 2.0; 126b8a1809bSJed Brown for (j = 0; j < N; j++) c[j] = 0.0; 127b8a1809bSJed Brown #endif 128b8a1809bSJed Brown 129b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, HLINE); 130b8a1809bSJed Brown 13194c9c6d3SKarl Rupp if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity/precision appears to be %d microseconds.\n", quantum); 13294c9c6d3SKarl Rupp else PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity appears to be less than one microsecond.\n"); 133b8a1809bSJed Brown 134b8a1809bSJed Brown t = Second(); 1356f2b61bcSKarl Rupp for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 136b8a1809bSJed Brown t = 1.0E6 * (Second() - t); 137b8a1809bSJed Brown 13826f47effSBarry Smith PetscPrintf(PETSC_COMM_WORLD, "Each test below will take on the order of %d microseconds.\n", (int)t); 139b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, " (= %d clock ticks)\n", (int)(t / quantum)); 140b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "Increase the size of the arrays if this shows that\n"); 141b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "you are not getting at least 20 clock ticks per test.\n"); 142b8a1809bSJed Brown 143b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, HLINE); 144b8a1809bSJed Brown 145b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "WARNING -- The above is only a rough guideline.\n"); 146b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "For best results, please be sure you know the\n"); 147b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "precision of your system timer.\n"); 148b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, HLINE); 149b8a1809bSJed Brown 150b8a1809bSJed Brown /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 151b8a1809bSJed Brown 152b8a1809bSJed Brown scalar = 3.0; 153b8a1809bSJed Brown for (k = 0; k < NTIMES; k++) { 154106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 155b8a1809bSJed Brown /* ### COPY: c <- a ### */ 156b8a1809bSJed Brown times[0][k] = Second(); 157106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 158b8a1809bSJed Brown #if USE_MEMCPY 159b8a1809bSJed Brown memcpy(c, a, N * sizeof(double)); 160b8a1809bSJed Brown #elif SSE2 161b8a1809bSJed Brown for (j = 0; j < N; j += 8) { 162b8a1809bSJed Brown _mm_stream_pd(c + j + 0, _mm_load_pd(a + j + 0)); 163b8a1809bSJed Brown _mm_stream_pd(c + j + 2, _mm_load_pd(a + j + 2)); 164b8a1809bSJed Brown _mm_stream_pd(c + j + 4, _mm_load_pd(a + j + 4)); 165b8a1809bSJed Brown _mm_stream_pd(c + j + 6, _mm_load_pd(a + j + 6)); 166b8a1809bSJed Brown #if PREFETCH_NTA 167b8a1809bSJed Brown _mm_prefetch(a + j + 64, _MM_HINT_NTA); 168b8a1809bSJed Brown #endif 169b8a1809bSJed Brown } 170b8a1809bSJed Brown #else 171b8a1809bSJed Brown for (j = 0; j < N; j++) c[j] = a[j]; 172b8a1809bSJed Brown #endif 173106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 174b8a1809bSJed Brown times[0][k] = Second() - times[0][k]; 175b8a1809bSJed Brown 176b8a1809bSJed Brown /* ### SCALE: b <- scalar * c ### */ 177b8a1809bSJed Brown times[1][k] = Second(); 178106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 179b8a1809bSJed Brown #if SSE2 180b8a1809bSJed Brown { 181b8a1809bSJed Brown __m128d scalar2 = _mm_set1_pd(scalar); 182b8a1809bSJed Brown for (j = 0; j < N; j += 8) { 183b8a1809bSJed Brown _mm_stream_pd(b + j + 0, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0))); 184b8a1809bSJed Brown _mm_stream_pd(b + j + 2, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2))); 185b8a1809bSJed Brown _mm_stream_pd(b + j + 4, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4))); 186b8a1809bSJed Brown _mm_stream_pd(b + j + 6, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6))); 187b8a1809bSJed Brown #if PREFETCH_NTA 188b8a1809bSJed Brown _mm_prefetch(c + j + 64, _MM_HINT_NTA); 189b8a1809bSJed Brown #endif 190b8a1809bSJed Brown } 191b8a1809bSJed Brown } 192b8a1809bSJed Brown #else 193b8a1809bSJed Brown for (j = 0; j < N; j++) b[j] = scalar * c[j]; 194b8a1809bSJed Brown #endif 195106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 196b8a1809bSJed Brown times[1][k] = Second() - times[1][k]; 197b8a1809bSJed Brown 198b8a1809bSJed Brown /* ### ADD: c <- a + b ### */ 199b8a1809bSJed Brown times[2][k] = Second(); 200106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 201b8a1809bSJed Brown #if SSE2 202b8a1809bSJed Brown { 203b8a1809bSJed Brown for (j = 0; j < N; j += 8) { 204b8a1809bSJed Brown _mm_stream_pd(c + j + 0, _mm_add_pd(_mm_load_pd(a + j + 0), _mm_load_pd(b + j + 0))); 205b8a1809bSJed Brown _mm_stream_pd(c + j + 2, _mm_add_pd(_mm_load_pd(a + j + 2), _mm_load_pd(b + j + 2))); 206b8a1809bSJed Brown _mm_stream_pd(c + j + 4, _mm_add_pd(_mm_load_pd(a + j + 4), _mm_load_pd(b + j + 4))); 207b8a1809bSJed Brown _mm_stream_pd(c + j + 6, _mm_add_pd(_mm_load_pd(a + j + 6), _mm_load_pd(b + j + 6))); 208b8a1809bSJed Brown #if PREFETCH_NTA 209b8a1809bSJed Brown _mm_prefetch(a + j + 64, _MM_HINT_NTA); 210b8a1809bSJed Brown _mm_prefetch(b + j + 64, _MM_HINT_NTA); 211b8a1809bSJed Brown #endif 212b8a1809bSJed Brown } 213b8a1809bSJed Brown } 214b8a1809bSJed Brown #else 215b8a1809bSJed Brown for (j = 0; j < N; j++) c[j] = a[j] + b[j]; 216b8a1809bSJed Brown #endif 217106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 218b8a1809bSJed Brown times[2][k] = Second() - times[2][k]; 219b8a1809bSJed Brown 220b8a1809bSJed Brown /* ### TRIAD: a <- b + scalar * c ### */ 221b8a1809bSJed Brown times[3][k] = Second(); 222106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 223b8a1809bSJed Brown #if SSE2 224b8a1809bSJed Brown { 225b8a1809bSJed Brown __m128d scalar2 = _mm_set1_pd(scalar); 226b8a1809bSJed Brown for (j = 0; j < N; j += 8) { 227b8a1809bSJed Brown _mm_stream_pd(a + j + 0, _mm_add_pd(_mm_load_pd(b + j + 0), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0)))); 228b8a1809bSJed Brown _mm_stream_pd(a + j + 2, _mm_add_pd(_mm_load_pd(b + j + 2), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2)))); 229b8a1809bSJed Brown _mm_stream_pd(a + j + 4, _mm_add_pd(_mm_load_pd(b + j + 4), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4)))); 230b8a1809bSJed Brown _mm_stream_pd(a + j + 6, _mm_add_pd(_mm_load_pd(b + j + 6), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6)))); 231b8a1809bSJed Brown #if PREFETCH_NTA 232b8a1809bSJed Brown _mm_prefetch(b + j + 64, _MM_HINT_NTA); 233b8a1809bSJed Brown _mm_prefetch(c + j + 64, _MM_HINT_NTA); 234b8a1809bSJed Brown #endif 235b8a1809bSJed Brown } 236b8a1809bSJed Brown } 237b8a1809bSJed Brown #else 238b8a1809bSJed Brown for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j]; 239b8a1809bSJed Brown #endif 240106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 241b8a1809bSJed Brown times[3][k] = Second() - times[3][k]; 242b8a1809bSJed Brown } 243b8a1809bSJed Brown 244b8a1809bSJed Brown /* --- SUMMARY --- */ 245b8a1809bSJed Brown 2466f2b61bcSKarl Rupp for (k = 0; k < NTIMES; k++) 247b8a1809bSJed Brown for (j = 0; j < 4; j++) { 248b8a1809bSJed Brown rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 249b8a1809bSJed Brown mintime[j] = MIN(mintime[j], times[j][k]); 250b8a1809bSJed Brown maxtime[j] = MAX(maxtime[j], times[j][k]); 251b8a1809bSJed Brown } 252b8a1809bSJed Brown 253b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "%8s: %11s %11s %11s %11s %11s\n", "Function", "Rate (MB/s)", "Total (MB/s)", "RMS time", "Min time", "Max time"); 254b8a1809bSJed Brown for (j = 0; j < 4; j++) { 255b8a1809bSJed Brown rmstime[j] = sqrt(rmstime[j] / (double)NTIMES); 256b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD, "%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06 * bytes[j] / mintime[j], size * 1.0e-06 * bytes[j] / mintime[j], rmstime[j], mintime[j], maxtime[j]); 257b8a1809bSJed Brown } 2589566063dSJacob Faibussowitsch PetscCall(PetscFinalize()); 259b122ec5aSJacob Faibussowitsch return 0; 260b8a1809bSJed Brown } 261b8a1809bSJed Brown 262a6dfd86eSKarl Rupp static double Second() 263a6dfd86eSKarl Rupp { 264b8a1809bSJed Brown double t; 2658563dfccSBarry Smith PetscTime(&t); 266b8a1809bSJed Brown return t; 267b8a1809bSJed Brown } 268b8a1809bSJed Brown 269b8a1809bSJed Brown #define M 20 270d1d3a73cSBarry Smith static int checktick(void) 271b8a1809bSJed Brown { 272b8a1809bSJed Brown int i, minDelta, Delta; 273b8a1809bSJed Brown double t1, t2, timesfound[M]; 274b8a1809bSJed Brown 275b8a1809bSJed Brown /* Collect a sequence of M unique time values from the system. */ 276b8a1809bSJed Brown 277b8a1809bSJed Brown for (i = 0; i < M; i++) { 278b8a1809bSJed Brown t1 = Second(); 279*67595998SJunchao Zhang while ((t2 = Second()) - t1 < 1.0E-6) { } 280b8a1809bSJed Brown timesfound[i] = t1 = t2; 281b8a1809bSJed Brown } 282b8a1809bSJed Brown 283b8a1809bSJed Brown /* 284b8a1809bSJed Brown * Determine the minimum difference between these M values. 285b8a1809bSJed Brown * This result will be our estimate (in microseconds) for the 286b8a1809bSJed Brown * clock granularity. 287b8a1809bSJed Brown */ 288b8a1809bSJed Brown 289b8a1809bSJed Brown minDelta = 1000000; 290b8a1809bSJed Brown for (i = 1; i < M; i++) { 291b8a1809bSJed Brown Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1])); 292b8a1809bSJed Brown minDelta = MIN(minDelta, MAX(Delta, 0)); 293b8a1809bSJed Brown } 294b8a1809bSJed Brown 29511cc89d2SBarry Smith return minDelta; 296b8a1809bSJed Brown } 297