1d3ae85c4SBarry Smith #include <stdio.h> 2d3ae85c4SBarry Smith #include <math.h> 3d3ae85c4SBarry Smith #include <limits.h> 4d3ae85c4SBarry Smith #include <float.h> 55e71baefSBarry Smith #include <petscsys.h> 6d3ae85c4SBarry Smith 7d3ae85c4SBarry Smith /* 80e3d61c9SBarry Smith Program: Stream 90e3d61c9SBarry Smith Programmer: Joe R. Zagar 100e3d61c9SBarry Smith Revision: 4.0-BETA, October 24, 1995 110e3d61c9SBarry Smith Original code developed by John D. McCalpin 120e3d61c9SBarry Smith 130e3d61c9SBarry Smith This program measures memory transfer rates in MB/s for simple 140e3d61c9SBarry Smith computational kernels coded in C. These numbers reveal the quality 150e3d61c9SBarry Smith of code generation for simple uncacheable kernels as well as showing 160e3d61c9SBarry Smith the cost of floating-point operations relative to memory accesses. 170e3d61c9SBarry Smith 180e3d61c9SBarry Smith INSTRUCTIONS: 190e3d61c9SBarry Smith 200e3d61c9SBarry Smith 1) Stream requires a good bit of memory to run. Adjust the 210e3d61c9SBarry Smith value of 'N' (below) to give a 'timing calibration' of 220e3d61c9SBarry Smith at least 20 clock-ticks. This will provide rate estimates 230e3d61c9SBarry Smith that should be good to about 5% precision. 24d3ae85c4SBarry Smith */ 25d3ae85c4SBarry Smith 26d3ae85c4SBarry Smith #define N 2000000 2751096aa5SJacob Faibussowitsch #define M 20 28d3ae85c4SBarry Smith #define NTIMES 50 29d3ae85c4SBarry Smith #define OFFSET 0 30d3ae85c4SBarry Smith 31d3ae85c4SBarry Smith /* 320e3d61c9SBarry Smith 3) Compile the code with full optimization. Many compilers 330e3d61c9SBarry Smith generate unreasonably bad code before the optimizer tightens 340e3d61c9SBarry Smith things up. If the results are unreasonably good, on the 350e3d61c9SBarry Smith other hand, the optimizer might be too smart for me! 360e3d61c9SBarry Smith 370e3d61c9SBarry Smith Try compiling with: 380e3d61c9SBarry Smith cc -O stream_d.c second.c -o stream_d -lm 390e3d61c9SBarry Smith 400e3d61c9SBarry Smith This is known to work on Cray, SGI, IBM, and Sun machines. 410e3d61c9SBarry Smith 420e3d61c9SBarry Smith 4) Mail the results to mccalpin@cs.virginia.edu 430e3d61c9SBarry Smith Be sure to include: 440e3d61c9SBarry Smith a) computer hardware model number and software revision 450e3d61c9SBarry Smith b) the compiler flags 460e3d61c9SBarry Smith c) all of the output from the test case. 470e3d61c9SBarry Smith Thanks! 480e3d61c9SBarry Smith 49d3ae85c4SBarry Smith */ 50d3ae85c4SBarry Smith 51d3ae85c4SBarry Smith #define HLINE "-------------------------------------------------------------\n" 52d3ae85c4SBarry Smith 5351096aa5SJacob Faibussowitsch static double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 54d3ae85c4SBarry Smith 55d3ae85c4SBarry Smith static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 56d3ae85c4SBarry Smith 5751096aa5SJacob Faibussowitsch static int checktick(void) 5851096aa5SJacob Faibussowitsch { 5951096aa5SJacob Faibussowitsch int minDelta = 1000000; 6051096aa5SJacob Faibussowitsch double timesfound[M]; 6151096aa5SJacob Faibussowitsch 6251096aa5SJacob Faibussowitsch /* Collect a sequence of M unique time values from the system. */ 6351096aa5SJacob Faibussowitsch 6451096aa5SJacob Faibussowitsch for (int i = 0; i < M; ++i) { 6551096aa5SJacob Faibussowitsch const double t1 = MPI_Wtime(); 6651096aa5SJacob Faibussowitsch 6751096aa5SJacob Faibussowitsch while (((timesfound[i] = MPI_Wtime()) - t1) < 1.0E-6) ; 6851096aa5SJacob Faibussowitsch } 6951096aa5SJacob Faibussowitsch 7051096aa5SJacob Faibussowitsch /* 7151096aa5SJacob Faibussowitsch Determine the minimum difference between these M values. 7251096aa5SJacob Faibussowitsch This result will be our estimate (in microseconds) for the 7351096aa5SJacob Faibussowitsch clock granularity. 7451096aa5SJacob Faibussowitsch */ 7551096aa5SJacob Faibussowitsch 7651096aa5SJacob Faibussowitsch for (int i = 1; i < M; ++i) { 7751096aa5SJacob Faibussowitsch int Delta = (int)(1.0E6*(timesfound[i]-timesfound[i-1])); 7851096aa5SJacob Faibussowitsch 7951096aa5SJacob Faibussowitsch minDelta = PetscMin(minDelta,PetscMax(Delta,0)); 8051096aa5SJacob Faibussowitsch } 8151096aa5SJacob Faibussowitsch return minDelta; 8251096aa5SJacob Faibussowitsch } 8351096aa5SJacob Faibussowitsch 84d3ae85c4SBarry Smith static double bytes[4] = { 85d3ae85c4SBarry Smith 2 * sizeof(double) * N, 86d3ae85c4SBarry Smith 2 * sizeof(double) * N, 87d3ae85c4SBarry Smith 3 * sizeof(double) * N, 88d3ae85c4SBarry Smith 3 * sizeof(double) * N 89d3ae85c4SBarry Smith }; 90d3ae85c4SBarry Smith 91d3ae85c4SBarry Smith int main(int argc,char **args) 92d3ae85c4SBarry Smith { 9351096aa5SJacob Faibussowitsch const double scalar = 3.0; 9451096aa5SJacob Faibussowitsch double t,times[4][NTIMES],irate[4],rate[4]; 9551096aa5SJacob Faibussowitsch PetscMPIInt rank,size,resultlen; 9651096aa5SJacob Faibussowitsch char hostname[MPI_MAX_PROCESSOR_NAME] = {0}; 97d3ae85c4SBarry Smith 98b8abcfdeSJacob Faibussowitsch PetscCall(PetscInitialize(&argc,&args,NULL,NULL)); 9951096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank)); 10051096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size)); 101d3ae85c4SBarry Smith 10251096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Get_processor_name(hostname,&resultlen));(void)resultlen; 10351096aa5SJacob Faibussowitsch if (rank) PetscCallMPI(MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD)); 10451096aa5SJacob Faibussowitsch else { 10551096aa5SJacob Faibussowitsch for (int j = 1; j < size; ++j) { 10651096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE)); 1076b58a888SBarry Smith } 1081df1832dSBarry Smith } 10951096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 110d3ae85c4SBarry Smith 111d3ae85c4SBarry Smith /* --- SETUP --- determine precision and check timing --- */ 112d3ae85c4SBarry Smith 113dd400576SPatrick Sanan if (rank == 0) { 11451096aa5SJacob Faibussowitsch /* 11551096aa5SJacob Faibussowitsch printf(HLINE); 116d3ae85c4SBarry Smith printf("Array size = %d, Offset = %d\n" , N, OFFSET); 117d3ae85c4SBarry Smith printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); 118d3ae85c4SBarry Smith printf("Each test is run %d times, but only\n", NTIMES); 119d3ae85c4SBarry Smith printf("the *best* time for each is used.\n"); 12051096aa5SJacob Faibussowitsch printf(HLINE); 12151096aa5SJacob Faibussowitsch */ 122d3ae85c4SBarry Smith } 123d3ae85c4SBarry Smith 124d3ae85c4SBarry Smith /* Get initial value for system clock. */ 12551096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) { 126d3ae85c4SBarry Smith a[j] = 1.0; 127d3ae85c4SBarry Smith b[j] = 2.0; 128d3ae85c4SBarry Smith c[j] = 0.0; 129d3ae85c4SBarry Smith } 130d3ae85c4SBarry Smith 131dd400576SPatrick Sanan if (rank == 0) { 13251096aa5SJacob Faibussowitsch int quantum; 133d3ae85c4SBarry Smith if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ 134d3ae85c4SBarry Smith else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ 135d3ae85c4SBarry Smith } 136d3ae85c4SBarry Smith 13719623ac0SBarry Smith t = MPI_Wtime(); 13851096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) a[j] *= 2.0; 13919623ac0SBarry Smith t = 1.0E6 * (MPI_Wtime() - t); 140d3ae85c4SBarry Smith 141dd400576SPatrick Sanan if (rank == 0) { 14251096aa5SJacob Faibussowitsch /* 14351096aa5SJacob Faibussowitsch printf("Each test below will take on the order of %d microseconds.\n", (int) t); 144d3ae85c4SBarry Smith printf(" (= %d clock ticks)\n", (int) (t/quantum)); 145d3ae85c4SBarry Smith printf("Increase the size of the arrays if this shows that\n"); 146d3ae85c4SBarry Smith printf("you are not getting at least 20 clock ticks per test.\n"); 14751096aa5SJacob Faibussowitsch printf(HLINE); 14851096aa5SJacob Faibussowitsch */ 149d3ae85c4SBarry Smith } 150d3ae85c4SBarry Smith 151d3ae85c4SBarry Smith /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 152d3ae85c4SBarry Smith 15351096aa5SJacob Faibussowitsch for (int k = 0; k < NTIMES; ++k) { 15451096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 15519623ac0SBarry Smith times[0][k] = MPI_Wtime(); 156d3ae85c4SBarry Smith /* should all these barriers be pulled outside of the time call? */ 15751096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 15851096aa5SJacob Faibussowitsch PetscCall(PetscArraycpy(c,a,N)); 15951096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 16019623ac0SBarry Smith times[0][k] = MPI_Wtime() - times[0][k]; 161d3ae85c4SBarry Smith 16219623ac0SBarry Smith times[1][k] = MPI_Wtime(); 16351096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 16451096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) b[j] = scalar*c[j]; 16551096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 16619623ac0SBarry Smith times[1][k] = MPI_Wtime() - times[1][k]; 167d3ae85c4SBarry Smith 16819623ac0SBarry Smith times[2][k] = MPI_Wtime(); 16951096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 17051096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) c[j] = a[j]+b[j]; 17151096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 17219623ac0SBarry Smith times[2][k] = MPI_Wtime() - times[2][k]; 173d3ae85c4SBarry Smith 17419623ac0SBarry Smith times[3][k] = MPI_Wtime(); 17551096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 17651096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) a[j] = b[j]+scalar*c[j]; 17751096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 17819623ac0SBarry Smith times[3][k] = MPI_Wtime() - times[3][k]; 179d3ae85c4SBarry Smith } 180d3ae85c4SBarry Smith 181d3ae85c4SBarry Smith /* --- SUMMARY --- */ 182d3ae85c4SBarry Smith 18351096aa5SJacob Faibussowitsch for (int k = 0; k < NTIMES; ++k) { 18451096aa5SJacob Faibussowitsch for (int j = 0; j < 4; ++j) mintime[j] = PetscMin(mintime[j],times[j][k]); 18551096aa5SJacob Faibussowitsch } 186d3ae85c4SBarry Smith 18751096aa5SJacob Faibussowitsch for (int j = 0; j < 4; ++j) irate[j] = 1.0E-06 * bytes[j]/mintime[j]; 18851096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD)); 189d3ae85c4SBarry Smith 190*c5853193SPierre Jolivet if (rank == 0) { 19151096aa5SJacob Faibussowitsch FILE *fd; 19251096aa5SJacob Faibussowitsch 19351096aa5SJacob Faibussowitsch if (size) { 1944198fb66SBarry Smith double prate; 19551096aa5SJacob Faibussowitsch 1964198fb66SBarry Smith fd = fopen("flops","r"); 1974198fb66SBarry Smith fscanf(fd,"%lg",&prate); 1984198fb66SBarry Smith fclose(fd); 1994198fb66SBarry Smith printf("%d %11.4f Rate (MB/s) %g \n",size,rate[3],rate[3]/prate); 20051096aa5SJacob Faibussowitsch } else { 20151096aa5SJacob Faibussowitsch fd = fopen("flops","w"); 20251096aa5SJacob Faibussowitsch fprintf(fd,"%g\n",rate[3]); 20351096aa5SJacob Faibussowitsch fclose(fd); 20451096aa5SJacob Faibussowitsch printf("%d %11.4f Rate (MB/s)\n",size,rate[3]); 2054198fb66SBarry Smith } 206d3ae85c4SBarry Smith } 20751096aa5SJacob Faibussowitsch PetscCall(PetscFinalize()); 208d3ae85c4SBarry Smith return 0; 209d3ae85c4SBarry Smith } 210