1*b8a1809bSJed Brown static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2*b8a1809bSJed Brown 3*b8a1809bSJed Brown /* Note: this file has been modified significantly from its original version */ 4*b8a1809bSJed Brown #include <emmintrin.h> 5*b8a1809bSJed Brown #include <petsctime.h> 6*b8a1809bSJed Brown #include <petscsys.h> 7*b8a1809bSJed Brown #include <numa.h> 8*b8a1809bSJed Brown #include <limits.h> 9*b8a1809bSJed Brown #include <float.h> 10*b8a1809bSJed Brown 11*b8a1809bSJed Brown #ifndef SSE2 12*b8a1809bSJed Brown # define SSE2 1 13*b8a1809bSJed Brown #endif 14*b8a1809bSJed Brown #ifndef __SSE2__ 15*b8a1809bSJed Brown # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 16*b8a1809bSJed Brown #endif 17*b8a1809bSJed Brown #ifndef PREFETCH_NTA /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 18*b8a1809bSJed Brown # define PREFETCH_NTA 1 19*b8a1809bSJed Brown #endif 20*b8a1809bSJed Brown #ifndef STATIC_ALLOC /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 21*b8a1809bSJed Brown # define STATIC_ALLOC 0 22*b8a1809bSJed Brown #endif 23*b8a1809bSJed Brown #ifndef FAULT_TOGETHER /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 24*b8a1809bSJed Brown # define FAULT_TOGETHER 0 25*b8a1809bSJed Brown #endif 26*b8a1809bSJed Brown #ifndef USE_MEMCPY /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 27*b8a1809bSJed Brown # define USE_MEMCPY 0 28*b8a1809bSJed Brown #endif 29*b8a1809bSJed Brown 30*b8a1809bSJed Brown /* 31*b8a1809bSJed Brown * Program: Stream 32*b8a1809bSJed Brown * Programmer: Joe R. Zagar 33*b8a1809bSJed Brown * Revision: 4.0-BETA, October 24, 1995 34*b8a1809bSJed Brown * Original code developed by John D. McCalpin 35*b8a1809bSJed Brown * 36*b8a1809bSJed Brown * This program measures memory transfer rates in MB/s for simple 37*b8a1809bSJed Brown * computational kernels coded in C. These numbers reveal the quality 38*b8a1809bSJed Brown * of code generation for simple uncacheable kernels as well as showing 39*b8a1809bSJed Brown * the cost of floating-point operations relative to memory accesses. 40*b8a1809bSJed Brown * 41*b8a1809bSJed Brown * INSTRUCTIONS: 42*b8a1809bSJed Brown * 43*b8a1809bSJed Brown * 1) Stream requires a good bit of memory to run. Adjust the 44*b8a1809bSJed Brown * value of 'N' (below) to give a 'timing calibration' of 45*b8a1809bSJed Brown * at least 20 clock-ticks. This will provide rate estimates 46*b8a1809bSJed Brown * that should be good to about 5% precision. 47*b8a1809bSJed Brown */ 48*b8a1809bSJed Brown 49*b8a1809bSJed Brown # define N 4000000 50*b8a1809bSJed Brown # define NTIMES 100 51*b8a1809bSJed Brown # define OFFSET 0 52*b8a1809bSJed Brown 53*b8a1809bSJed Brown # define HLINE "-------------------------------------------------------------\n" 54*b8a1809bSJed Brown 55*b8a1809bSJed Brown # ifndef MIN 56*b8a1809bSJed Brown # define MIN(x,y) ((x)<(y)?(x):(y)) 57*b8a1809bSJed Brown # endif 58*b8a1809bSJed Brown # ifndef MAX 59*b8a1809bSJed Brown # define MAX(x,y) ((x)>(y)?(x):(y)) 60*b8a1809bSJed Brown # endif 61*b8a1809bSJed Brown 62*b8a1809bSJed Brown #if STATIC_ALLOC 63*b8a1809bSJed Brown double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 64*b8a1809bSJed Brown #endif 65*b8a1809bSJed Brown 66*b8a1809bSJed Brown static int checktick(void); 67*b8a1809bSJed Brown static double Second(void); 68*b8a1809bSJed Brown 69*b8a1809bSJed Brown int main(int argc,char *argv[]) 70*b8a1809bSJed Brown { 71*b8a1809bSJed Brown const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 72*b8a1809bSJed Brown const double bytes[4] = {2 * sizeof(double) * N, 73*b8a1809bSJed Brown 2 * sizeof(double) * N, 74*b8a1809bSJed Brown 3 * sizeof(double) * N, 75*b8a1809bSJed Brown 3 * sizeof(double) * N}; 76*b8a1809bSJed Brown double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 77*b8a1809bSJed Brown int quantum; 78*b8a1809bSJed Brown int BytesPerWord,j,k,size; 79*b8a1809bSJed Brown PetscInt node = -1; 80*b8a1809bSJed Brown double scalar, t, times[4][NTIMES]; 81*b8a1809bSJed Brown #if !STATIC_ALLOC 82*b8a1809bSJed Brown double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 83*b8a1809bSJed Brown #endif 84*b8a1809bSJed Brown 85*b8a1809bSJed Brown PetscInitialize(&argc,&argv,0,help); 86*b8a1809bSJed Brown MPI_Comm_size(PETSC_COMM_WORLD,&size); 87*b8a1809bSJed Brown PetscOptionsGetInt(PETSC_NULL,"-node",&node,PETSC_NULL); 88*b8a1809bSJed Brown /* --- SETUP --- determine precision and check timing --- */ 89*b8a1809bSJed Brown 90*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 91*b8a1809bSJed Brown BytesPerWord = sizeof(double); 92*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n", 93*b8a1809bSJed Brown BytesPerWord); 94*b8a1809bSJed Brown 95*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 96*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n" , N, OFFSET); 97*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n", 98*b8a1809bSJed Brown (3 * N * BytesPerWord) / 1048576.0); 99*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 100*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 101*b8a1809bSJed Brown 102*b8a1809bSJed Brown /* Get initial value for system clock. */ 103*b8a1809bSJed Brown 104*b8a1809bSJed Brown #if !STATIC_ALLOC 105*b8a1809bSJed Brown if (node == -1) { 106*b8a1809bSJed Brown posix_memalign((void**)&a,64,N*sizeof(double)); 107*b8a1809bSJed Brown posix_memalign((void**)&b,64,N*sizeof(double)); 108*b8a1809bSJed Brown posix_memalign((void**)&c,64,N*sizeof(double)); 109*b8a1809bSJed Brown } else if (node == -2) { 110*b8a1809bSJed Brown a = malloc(N*sizeof(double)); 111*b8a1809bSJed Brown b = malloc(N*sizeof(double)); 112*b8a1809bSJed Brown c = malloc(N*sizeof(double)); 113*b8a1809bSJed Brown } else { 114*b8a1809bSJed Brown a = numa_alloc_onnode(N*sizeof(double),node); 115*b8a1809bSJed Brown b = numa_alloc_onnode(N*sizeof(double),node); 116*b8a1809bSJed Brown c = numa_alloc_onnode(N*sizeof(double),node); 117*b8a1809bSJed Brown } 118*b8a1809bSJed Brown #endif 119*b8a1809bSJed Brown #if FAULT_TOGETHER 120*b8a1809bSJed Brown for (j=0; j<N; j++) { 121*b8a1809bSJed Brown a[j] = 1.0; 122*b8a1809bSJed Brown b[j] = 2.0; 123*b8a1809bSJed Brown c[j] = 0.0; 124*b8a1809bSJed Brown } 125*b8a1809bSJed Brown #else 126*b8a1809bSJed Brown for (j=0; j<N; j++) a[j] = 1.0; 127*b8a1809bSJed Brown for (j=0; j<N; j++) b[j] = 2.0; 128*b8a1809bSJed Brown for (j=0; j<N; j++) c[j] = 0.0; 129*b8a1809bSJed Brown #endif 130*b8a1809bSJed Brown 131*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 132*b8a1809bSJed Brown 133*b8a1809bSJed Brown if ( (quantum = checktick()) >= 1) 134*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be " 135*b8a1809bSJed Brown "%d microseconds.\n", quantum); 136*b8a1809bSJed Brown else 137*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be " 138*b8a1809bSJed Brown "less than one microsecond.\n"); 139*b8a1809bSJed Brown 140*b8a1809bSJed Brown t = Second(); 141*b8a1809bSJed Brown for (j = 0; j < N; j++) 142*b8a1809bSJed Brown a[j] = 2.0E0 * a[j]; 143*b8a1809bSJed Brown t = 1.0E6 * (Second() - t); 144*b8a1809bSJed Brown 145*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order" 146*b8a1809bSJed Brown " of %d microseconds.\n", (int) t ); 147*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum) ); 148*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 149*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 150*b8a1809bSJed Brown 151*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 152*b8a1809bSJed Brown 153*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 154*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 155*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 156*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 157*b8a1809bSJed Brown 158*b8a1809bSJed Brown /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 159*b8a1809bSJed Brown 160*b8a1809bSJed Brown scalar = 3.0; 161*b8a1809bSJed Brown for (k=0; k<NTIMES; k++) { 162*b8a1809bSJed Brown /* ### COPY: c <- a ### */ 163*b8a1809bSJed Brown times[0][k] = Second(); 164*b8a1809bSJed Brown #if USE_MEMCPY 165*b8a1809bSJed Brown memcpy(c,a,N*sizeof(double)); 166*b8a1809bSJed Brown #elif SSE2 167*b8a1809bSJed Brown for (j=0; j<N; j+=8) { 168*b8a1809bSJed Brown _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 169*b8a1809bSJed Brown _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 170*b8a1809bSJed Brown _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 171*b8a1809bSJed Brown _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 172*b8a1809bSJed Brown # if PREFETCH_NTA 173*b8a1809bSJed Brown _mm_prefetch(a+j+64,_MM_HINT_NTA); 174*b8a1809bSJed Brown # endif 175*b8a1809bSJed Brown } 176*b8a1809bSJed Brown #else 177*b8a1809bSJed Brown for (j=0; j<N; j++) c[j] = a[j]; 178*b8a1809bSJed Brown #endif 179*b8a1809bSJed Brown times[0][k] = Second() - times[0][k]; 180*b8a1809bSJed Brown 181*b8a1809bSJed Brown /* ### SCALE: b <- scalar * c ### */ 182*b8a1809bSJed Brown times[1][k] = Second(); 183*b8a1809bSJed Brown #if SSE2 184*b8a1809bSJed Brown { 185*b8a1809bSJed Brown __m128d scalar2 = _mm_set1_pd(scalar); 186*b8a1809bSJed Brown for (j=0; j<N; j+=8) { 187*b8a1809bSJed Brown _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 188*b8a1809bSJed Brown _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 189*b8a1809bSJed Brown _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 190*b8a1809bSJed Brown _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 191*b8a1809bSJed Brown # if PREFETCH_NTA 192*b8a1809bSJed Brown _mm_prefetch(c+j+64,_MM_HINT_NTA); 193*b8a1809bSJed Brown # endif 194*b8a1809bSJed Brown } 195*b8a1809bSJed Brown } 196*b8a1809bSJed Brown #else 197*b8a1809bSJed Brown for (j=0; j<N; j++) b[j] = scalar*c[j]; 198*b8a1809bSJed Brown #endif 199*b8a1809bSJed Brown times[1][k] = Second() - times[1][k]; 200*b8a1809bSJed Brown 201*b8a1809bSJed Brown /* ### ADD: c <- a + b ### */ 202*b8a1809bSJed Brown times[2][k] = Second(); 203*b8a1809bSJed Brown #if SSE2 204*b8a1809bSJed Brown { 205*b8a1809bSJed Brown for (j=0; j<N; j+=8) { 206*b8a1809bSJed Brown _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 207*b8a1809bSJed Brown _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 208*b8a1809bSJed Brown _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 209*b8a1809bSJed Brown _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 210*b8a1809bSJed Brown # if PREFETCH_NTA 211*b8a1809bSJed Brown _mm_prefetch(a+j+64,_MM_HINT_NTA); 212*b8a1809bSJed Brown _mm_prefetch(b+j+64,_MM_HINT_NTA); 213*b8a1809bSJed Brown # endif 214*b8a1809bSJed Brown } 215*b8a1809bSJed Brown } 216*b8a1809bSJed Brown #else 217*b8a1809bSJed Brown for (j=0; j<N; j++) c[j] = a[j]+b[j]; 218*b8a1809bSJed Brown #endif 219*b8a1809bSJed Brown times[2][k] = Second() - times[2][k]; 220*b8a1809bSJed Brown 221*b8a1809bSJed Brown /* ### TRIAD: a <- b + scalar * c ### */ 222*b8a1809bSJed Brown times[3][k] = Second(); 223*b8a1809bSJed Brown #if SSE2 224*b8a1809bSJed Brown { 225*b8a1809bSJed Brown __m128d scalar2 = _mm_set1_pd(scalar); 226*b8a1809bSJed Brown for (j=0; j<N; j+=8) { 227*b8a1809bSJed Brown _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 228*b8a1809bSJed Brown _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 229*b8a1809bSJed Brown _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 230*b8a1809bSJed Brown _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 231*b8a1809bSJed Brown # if PREFETCH_NTA 232*b8a1809bSJed Brown _mm_prefetch(b+j+64,_MM_HINT_NTA); 233*b8a1809bSJed Brown _mm_prefetch(c+j+64,_MM_HINT_NTA); 234*b8a1809bSJed Brown # endif 235*b8a1809bSJed Brown } 236*b8a1809bSJed Brown } 237*b8a1809bSJed Brown #else 238*b8a1809bSJed Brown for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 239*b8a1809bSJed Brown #endif 240*b8a1809bSJed Brown times[3][k] = Second() - times[3][k]; 241*b8a1809bSJed Brown } 242*b8a1809bSJed Brown 243*b8a1809bSJed Brown /* --- SUMMARY --- */ 244*b8a1809bSJed Brown 245*b8a1809bSJed Brown for (k=0; k<NTIMES; k++) { 246*b8a1809bSJed Brown for (j=0; j<4; j++) { 247*b8a1809bSJed Brown rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 248*b8a1809bSJed Brown mintime[j] = MIN(mintime[j], times[j][k]); 249*b8a1809bSJed Brown maxtime[j] = MAX(maxtime[j], times[j][k]); 250*b8a1809bSJed Brown } 251*b8a1809bSJed Brown } 252*b8a1809bSJed Brown 253*b8a1809bSJed Brown 254*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 255*b8a1809bSJed Brown for (j=0; j<4; j++) { 256*b8a1809bSJed Brown rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 257*b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 258*b8a1809bSJed Brown } 259*b8a1809bSJed Brown PetscFinalize(); 260*b8a1809bSJed Brown return 0; 261*b8a1809bSJed Brown } 262*b8a1809bSJed Brown 263*b8a1809bSJed Brown static double Second() { 264*b8a1809bSJed Brown double t; 265*b8a1809bSJed Brown MPI_Barrier(PETSC_COMM_WORLD); 266*b8a1809bSJed Brown PetscTime(t); 267*b8a1809bSJed Brown MPI_Barrier(PETSC_COMM_WORLD); 268*b8a1809bSJed Brown return t; 269*b8a1809bSJed Brown } 270*b8a1809bSJed Brown 271*b8a1809bSJed Brown #define M 20 272*b8a1809bSJed Brown static int checktick() 273*b8a1809bSJed Brown { 274*b8a1809bSJed Brown int i, minDelta, Delta; 275*b8a1809bSJed Brown double t1, t2, timesfound[M]; 276*b8a1809bSJed Brown 277*b8a1809bSJed Brown /* Collect a sequence of M unique time values from the system. */ 278*b8a1809bSJed Brown 279*b8a1809bSJed Brown for (i = 0; i < M; i++) { 280*b8a1809bSJed Brown t1 = Second(); 281*b8a1809bSJed Brown while( ((t2=Second()) - t1) < 1.0E-6 ) 282*b8a1809bSJed Brown ; 283*b8a1809bSJed Brown timesfound[i] = t1 = t2; 284*b8a1809bSJed Brown } 285*b8a1809bSJed Brown 286*b8a1809bSJed Brown /* 287*b8a1809bSJed Brown * Determine the minimum difference between these M values. 288*b8a1809bSJed Brown * This result will be our estimate (in microseconds) for the 289*b8a1809bSJed Brown * clock granularity. 290*b8a1809bSJed Brown */ 291*b8a1809bSJed Brown 292*b8a1809bSJed Brown minDelta = 1000000; 293*b8a1809bSJed Brown for (i = 1; i < M; i++) { 294*b8a1809bSJed Brown Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1])); 295*b8a1809bSJed Brown minDelta = MIN(minDelta, MAX(Delta,0)); 296*b8a1809bSJed Brown } 297*b8a1809bSJed Brown 298*b8a1809bSJed Brown return(minDelta); 299*b8a1809bSJed Brown } 300