1d3ae85c4SBarry Smith 2d3ae85c4SBarry Smith # include <stdio.h> 3d3ae85c4SBarry Smith # include <math.h> 4d3ae85c4SBarry Smith # include <limits.h> 5d3ae85c4SBarry Smith # include <float.h> 65e71baefSBarry Smith #include <petscsys.h> 7d3ae85c4SBarry Smith 8d3ae85c4SBarry Smith /* 90e3d61c9SBarry Smith Program: Stream 100e3d61c9SBarry Smith Programmer: Joe R. Zagar 110e3d61c9SBarry Smith Revision: 4.0-BETA, October 24, 1995 120e3d61c9SBarry Smith Original code developed by John D. McCalpin 130e3d61c9SBarry Smith 140e3d61c9SBarry Smith This program measures memory transfer rates in MB/s for simple 150e3d61c9SBarry Smith computational kernels coded in C. These numbers reveal the quality 160e3d61c9SBarry Smith of code generation for simple uncacheable kernels as well as showing 170e3d61c9SBarry Smith the cost of floating-point operations relative to memory accesses. 180e3d61c9SBarry Smith 190e3d61c9SBarry Smith INSTRUCTIONS: 200e3d61c9SBarry Smith 210e3d61c9SBarry Smith 1) Stream requires a good bit of memory to run. Adjust the 220e3d61c9SBarry Smith value of 'N' (below) to give a 'timing calibration' of 230e3d61c9SBarry Smith at least 20 clock-ticks. This will provide rate estimates 240e3d61c9SBarry Smith that should be good to about 5% precision. 25d3ae85c4SBarry Smith */ 26d3ae85c4SBarry Smith 27d3ae85c4SBarry Smith # define N 2000000 28d3ae85c4SBarry Smith # define NTIMES 50 29d3ae85c4SBarry Smith # define OFFSET 0 30d3ae85c4SBarry Smith 31d3ae85c4SBarry Smith /* 320e3d61c9SBarry Smith 3) Compile the code with full optimization. Many compilers 330e3d61c9SBarry Smith generate unreasonably bad code before the optimizer tightens 340e3d61c9SBarry Smith things up. If the results are unreasonably good, on the 350e3d61c9SBarry Smith other hand, the optimizer might be too smart for me! 360e3d61c9SBarry Smith 370e3d61c9SBarry Smith Try compiling with: 380e3d61c9SBarry Smith cc -O stream_d.c second.c -o stream_d -lm 390e3d61c9SBarry Smith 400e3d61c9SBarry Smith This is known to work on Cray, SGI, IBM, and Sun machines. 410e3d61c9SBarry Smith 420e3d61c9SBarry Smith 4) Mail the results to mccalpin@cs.virginia.edu 430e3d61c9SBarry Smith Be sure to include: 440e3d61c9SBarry Smith a) computer hardware model number and software revision 450e3d61c9SBarry Smith b) the compiler flags 460e3d61c9SBarry Smith c) all of the output from the test case. 470e3d61c9SBarry Smith Thanks! 480e3d61c9SBarry Smith 49d3ae85c4SBarry Smith */ 50d3ae85c4SBarry Smith 51d3ae85c4SBarry Smith # define HLINE "-------------------------------------------------------------\n" 52d3ae85c4SBarry Smith 53d3ae85c4SBarry Smith # ifndef MIN 54d3ae85c4SBarry Smith # define MIN(x,y) ((x)<(y) ? (x) : (y)) 55d3ae85c4SBarry Smith # endif 56d3ae85c4SBarry Smith # ifndef MAX 57d3ae85c4SBarry Smith # define MAX(x,y) ((x)>(y) ? (x) : (y)) 58d3ae85c4SBarry Smith # endif 59d3ae85c4SBarry Smith 60d3ae85c4SBarry Smith static double a[N+OFFSET], 61d3ae85c4SBarry Smith b[N+OFFSET], 62d3ae85c4SBarry Smith c[N+OFFSET]; 63d3ae85c4SBarry Smith /*double *a,*b,*c;*/ 64d3ae85c4SBarry Smith 65d3ae85c4SBarry Smith static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 66d3ae85c4SBarry Smith 67d3ae85c4SBarry Smith static double bytes[4] = { 68d3ae85c4SBarry Smith 2 * sizeof(double) * N, 69d3ae85c4SBarry Smith 2 * sizeof(double) * N, 70d3ae85c4SBarry Smith 3 * sizeof(double) * N, 71d3ae85c4SBarry Smith 3 * sizeof(double) * N 72d3ae85c4SBarry Smith }; 73d3ae85c4SBarry Smith 74d3ae85c4SBarry Smith int main(int argc,char **args) 75d3ae85c4SBarry Smith { 76d1d3a73cSBarry Smith int quantum, checktick(void); 77d3ae85c4SBarry Smith register int j, k; 78d3ae85c4SBarry Smith double scalar, t, times[4][NTIMES],irate[4],rate[4]; 79d3ae85c4SBarry Smith int rank,size,resultlen; 80d3ae85c4SBarry Smith char hostname[MPI_MAX_PROCESSOR_NAME]; 811df1832dSBarry Smith MPI_Status status; 828a4d7553SBarry Smith int ierr; 834198fb66SBarry Smith FILE *fd; 84d3ae85c4SBarry Smith 85*b8abcfdeSJacob Faibussowitsch PetscCall(PetscInitialize(&argc,&args,NULL,NULL)); 868a4d7553SBarry Smith ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr; 878a4d7553SBarry Smith ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr; 88d3ae85c4SBarry Smith 896b58a888SBarry Smith for (j=0; j<MPI_MAX_PROCESSOR_NAME; j++) { 906b58a888SBarry Smith hostname[j] = 0; 916b58a888SBarry Smith } 928a4d7553SBarry Smith ierr = MPI_Get_processor_name(hostname,&resultlen);if (ierr) return ierr; 93dd400576SPatrick Sanan if (rank == 0) { 941df1832dSBarry Smith for (j=1; j<size; j++) { 958a4d7553SBarry Smith ierr = MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,&status);if (ierr) return ierr; 961df1832dSBarry Smith } 971df1832dSBarry Smith } else { 988a4d7553SBarry Smith ierr = MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD);if (ierr) return ierr; 99d3ae85c4SBarry Smith } 100b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 101d3ae85c4SBarry Smith 102d3ae85c4SBarry Smith /* --- SETUP --- determine precision and check timing --- */ 103d3ae85c4SBarry Smith 104dd400576SPatrick Sanan if (rank == 0) { 105d3ae85c4SBarry Smith /*printf(HLINE); 106d3ae85c4SBarry Smith printf("Array size = %d, Offset = %d\n" , N, OFFSET); 107d3ae85c4SBarry Smith printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); 108d3ae85c4SBarry Smith printf("Each test is run %d times, but only\n", NTIMES); 109d3ae85c4SBarry Smith printf("the *best* time for each is used.\n"); 110d3ae85c4SBarry Smith printf(HLINE); */ 111d3ae85c4SBarry Smith } 112d3ae85c4SBarry Smith 113d3ae85c4SBarry Smith /* Get initial value for system clock. */ 114d3ae85c4SBarry Smith 115d3ae85c4SBarry Smith /* a = malloc(N*sizeof(double)); 116d3ae85c4SBarry Smith b = malloc(N*sizeof(double)); 117d3ae85c4SBarry Smith c = malloc(N*sizeof(double));*/ 118d3ae85c4SBarry Smith for (j=0; j<N; j++) { 119d3ae85c4SBarry Smith a[j] = 1.0; 120d3ae85c4SBarry Smith b[j] = 2.0; 121d3ae85c4SBarry Smith c[j] = 0.0; 122d3ae85c4SBarry Smith } 123d3ae85c4SBarry Smith 124dd400576SPatrick Sanan if (rank == 0) { 125d3ae85c4SBarry Smith if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ 126d3ae85c4SBarry Smith else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ 127d3ae85c4SBarry Smith } 128d3ae85c4SBarry Smith 12919623ac0SBarry Smith t = MPI_Wtime(); 130d3ae85c4SBarry Smith for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 13119623ac0SBarry Smith t = 1.0E6 * (MPI_Wtime() - t); 132d3ae85c4SBarry Smith 133dd400576SPatrick Sanan if (rank == 0) { 134d3ae85c4SBarry Smith /* printf("Each test below will take on the order of %d microseconds.\n", (int) t); 135d3ae85c4SBarry Smith printf(" (= %d clock ticks)\n", (int) (t/quantum)); 136d3ae85c4SBarry Smith printf("Increase the size of the arrays if this shows that\n"); 137d3ae85c4SBarry Smith printf("you are not getting at least 20 clock ticks per test.\n"); 138d3ae85c4SBarry Smith printf(HLINE);*/ 139d3ae85c4SBarry Smith } 140d3ae85c4SBarry Smith 141d3ae85c4SBarry Smith /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 142d3ae85c4SBarry Smith 143d3ae85c4SBarry Smith scalar = 3.0; 144d3ae85c4SBarry Smith for (k=0; k<NTIMES; k++) 145d3ae85c4SBarry Smith { 146b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 14719623ac0SBarry Smith times[0][k] = MPI_Wtime(); 148d3ae85c4SBarry Smith /* should all these barriers be pulled outside of the time call? */ 149b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 150d3ae85c4SBarry Smith for (j=0; j<N; j++) c[j] = a[j]; 151b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 15219623ac0SBarry Smith times[0][k] = MPI_Wtime() - times[0][k]; 153d3ae85c4SBarry Smith 15419623ac0SBarry Smith times[1][k] = MPI_Wtime(); 155b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 156d3ae85c4SBarry Smith for (j=0; j<N; j++) b[j] = scalar*c[j]; 157b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 15819623ac0SBarry Smith times[1][k] = MPI_Wtime() - times[1][k]; 159d3ae85c4SBarry Smith 16019623ac0SBarry Smith times[2][k] = MPI_Wtime(); 161b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 162d3ae85c4SBarry Smith for (j=0; j<N; j++) c[j] = a[j]+b[j]; 163b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 16419623ac0SBarry Smith times[2][k] = MPI_Wtime() - times[2][k]; 165d3ae85c4SBarry Smith 16619623ac0SBarry Smith times[3][k] = MPI_Wtime(); 167b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 168d3ae85c4SBarry Smith for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 169b7250d5dSSatish Balay ierr = MPI_Barrier(MPI_COMM_WORLD); 17019623ac0SBarry Smith times[3][k] = MPI_Wtime() - times[3][k]; 171d3ae85c4SBarry Smith } 172d3ae85c4SBarry Smith 173d3ae85c4SBarry Smith /* --- SUMMARY --- */ 174d3ae85c4SBarry Smith 175d3ae85c4SBarry Smith for (k=0; k<NTIMES; k++) 176d3ae85c4SBarry Smith for (j=0; j<4; j++) mintime[j] = MIN(mintime[j], times[j][k]); 177d3ae85c4SBarry Smith 178d3ae85c4SBarry Smith for (j=0; j<4; j++) irate[j] = 1.0E-06 * bytes[j]/mintime[j]; 179b7250d5dSSatish Balay ierr = MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 180b7250d5dSSatish Balay if (ierr) printf("Error calling MPI\n"); 181d3ae85c4SBarry Smith 182dd400576SPatrick Sanan if (rank == 0) { 1834198fb66SBarry Smith if (size == 1) { 1844198fb66SBarry Smith printf("%d %11.4f Rate (MB/s)\n",size, rate[3]); 1854198fb66SBarry Smith fd = fopen("flops","w"); 1864198fb66SBarry Smith fprintf(fd,"%g\n",rate[3]); 1874198fb66SBarry Smith fclose(fd); 1884198fb66SBarry Smith } else { 1894198fb66SBarry Smith double prate; 1904198fb66SBarry Smith fd = fopen("flops","r"); 1914198fb66SBarry Smith fscanf(fd,"%lg",&prate); 1924198fb66SBarry Smith fclose(fd); 1934198fb66SBarry Smith printf("%d %11.4f Rate (MB/s) %g \n", size, rate[3],rate[3]/prate); 1944198fb66SBarry Smith } 195d3ae85c4SBarry Smith } 1965e71baefSBarry Smith PetscFinalize(); 197d3ae85c4SBarry Smith return 0; 198d3ae85c4SBarry Smith } 199d3ae85c4SBarry Smith 200