1d3ae85c4SBarry Smith #include <stdio.h> 2d3ae85c4SBarry Smith #include <math.h> 3d3ae85c4SBarry Smith #include <limits.h> 4d3ae85c4SBarry Smith #include <float.h> 55e71baefSBarry Smith #include <petscsys.h> 6d3ae85c4SBarry Smith 7d3ae85c4SBarry Smith /* 80e3d61c9SBarry Smith Program: Stream 90e3d61c9SBarry Smith Programmer: Joe R. Zagar 100e3d61c9SBarry Smith Revision: 4.0-BETA, October 24, 1995 110e3d61c9SBarry Smith Original code developed by John D. McCalpin 120e3d61c9SBarry Smith 130e3d61c9SBarry Smith This program measures memory transfer rates in MB/s for simple 140e3d61c9SBarry Smith computational kernels coded in C. These numbers reveal the quality 150e3d61c9SBarry Smith of code generation for simple uncacheable kernels as well as showing 160e3d61c9SBarry Smith the cost of floating-point operations relative to memory accesses. 170e3d61c9SBarry Smith 180e3d61c9SBarry Smith INSTRUCTIONS: 190e3d61c9SBarry Smith 200e3d61c9SBarry Smith 1) Stream requires a good bit of memory to run. Adjust the 210e3d61c9SBarry Smith value of 'N' (below) to give a 'timing calibration' of 220e3d61c9SBarry Smith at least 20 clock-ticks. This will provide rate estimates 230e3d61c9SBarry Smith that should be good to about 5% precision. 24d3ae85c4SBarry Smith */ 25d3ae85c4SBarry Smith 26d3ae85c4SBarry Smith #define N 2000000 27*51096aa5SJacob Faibussowitsch #define M 20 28d3ae85c4SBarry Smith #define NTIMES 50 29d3ae85c4SBarry Smith #define OFFSET 0 30d3ae85c4SBarry Smith 31d3ae85c4SBarry Smith /* 320e3d61c9SBarry Smith 3) Compile the code with full optimization. Many compilers 330e3d61c9SBarry Smith generate unreasonably bad code before the optimizer tightens 340e3d61c9SBarry Smith things up. If the results are unreasonably good, on the 350e3d61c9SBarry Smith other hand, the optimizer might be too smart for me! 360e3d61c9SBarry Smith 370e3d61c9SBarry Smith Try compiling with: 380e3d61c9SBarry Smith cc -O stream_d.c second.c -o stream_d -lm 390e3d61c9SBarry Smith 400e3d61c9SBarry Smith This is known to work on Cray, SGI, IBM, and Sun machines. 410e3d61c9SBarry Smith 420e3d61c9SBarry Smith 4) Mail the results to mccalpin@cs.virginia.edu 430e3d61c9SBarry Smith Be sure to include: 440e3d61c9SBarry Smith a) computer hardware model number and software revision 450e3d61c9SBarry Smith b) the compiler flags 460e3d61c9SBarry Smith c) all of the output from the test case. 470e3d61c9SBarry Smith Thanks! 480e3d61c9SBarry Smith 49d3ae85c4SBarry Smith */ 50d3ae85c4SBarry Smith 51d3ae85c4SBarry Smith #define HLINE "-------------------------------------------------------------\n" 52d3ae85c4SBarry Smith 53*51096aa5SJacob Faibussowitsch static double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 54d3ae85c4SBarry Smith 55d3ae85c4SBarry Smith static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 56d3ae85c4SBarry Smith 57*51096aa5SJacob Faibussowitsch static int checktick(void) 58*51096aa5SJacob Faibussowitsch { 59*51096aa5SJacob Faibussowitsch int minDelta = 1000000; 60*51096aa5SJacob Faibussowitsch double timesfound[M]; 61*51096aa5SJacob Faibussowitsch 62*51096aa5SJacob Faibussowitsch /* Collect a sequence of M unique time values from the system. */ 63*51096aa5SJacob Faibussowitsch 64*51096aa5SJacob Faibussowitsch for (int i = 0; i < M; ++i) { 65*51096aa5SJacob Faibussowitsch const double t1 = MPI_Wtime(); 66*51096aa5SJacob Faibussowitsch 67*51096aa5SJacob Faibussowitsch while (((timesfound[i] = MPI_Wtime()) - t1) < 1.0E-6) ; 68*51096aa5SJacob Faibussowitsch } 69*51096aa5SJacob Faibussowitsch 70*51096aa5SJacob Faibussowitsch /* 71*51096aa5SJacob Faibussowitsch Determine the minimum difference between these M values. 72*51096aa5SJacob Faibussowitsch This result will be our estimate (in microseconds) for the 73*51096aa5SJacob Faibussowitsch clock granularity. 74*51096aa5SJacob Faibussowitsch */ 75*51096aa5SJacob Faibussowitsch 76*51096aa5SJacob Faibussowitsch for (int i = 1; i < M; ++i) { 77*51096aa5SJacob Faibussowitsch int Delta = (int)(1.0E6*(timesfound[i]-timesfound[i-1])); 78*51096aa5SJacob Faibussowitsch 79*51096aa5SJacob Faibussowitsch minDelta = PetscMin(minDelta,PetscMax(Delta,0)); 80*51096aa5SJacob Faibussowitsch } 81*51096aa5SJacob Faibussowitsch return minDelta; 82*51096aa5SJacob Faibussowitsch } 83*51096aa5SJacob Faibussowitsch 84d3ae85c4SBarry Smith static double bytes[4] = { 85d3ae85c4SBarry Smith 2 * sizeof(double) * N, 86d3ae85c4SBarry Smith 2 * sizeof(double) * N, 87d3ae85c4SBarry Smith 3 * sizeof(double) * N, 88d3ae85c4SBarry Smith 3 * sizeof(double) * N 89d3ae85c4SBarry Smith }; 90d3ae85c4SBarry Smith 91d3ae85c4SBarry Smith int main(int argc,char **args) 92d3ae85c4SBarry Smith { 93*51096aa5SJacob Faibussowitsch const double scalar = 3.0; 94*51096aa5SJacob Faibussowitsch double t,times[4][NTIMES],irate[4],rate[4]; 95*51096aa5SJacob Faibussowitsch PetscMPIInt rank,size,resultlen; 96*51096aa5SJacob Faibussowitsch char hostname[MPI_MAX_PROCESSOR_NAME] = {0}; 97d3ae85c4SBarry Smith 98b8abcfdeSJacob Faibussowitsch PetscCall(PetscInitialize(&argc,&args,NULL,NULL)); 99*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank)); 100*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size)); 101d3ae85c4SBarry Smith 102*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Get_processor_name(hostname,&resultlen));(void)resultlen; 103*51096aa5SJacob Faibussowitsch if (rank) PetscCallMPI(MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD)); 104*51096aa5SJacob Faibussowitsch else { 105*51096aa5SJacob Faibussowitsch for (int j = 1; j < size; ++j) { 106*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE)); 1076b58a888SBarry Smith } 1081df1832dSBarry Smith } 109*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 110d3ae85c4SBarry Smith 111d3ae85c4SBarry Smith /* --- SETUP --- determine precision and check timing --- */ 112d3ae85c4SBarry Smith 113dd400576SPatrick Sanan if (rank == 0) { 114*51096aa5SJacob Faibussowitsch /* 115*51096aa5SJacob Faibussowitsch printf(HLINE); 116d3ae85c4SBarry Smith printf("Array size = %d, Offset = %d\n" , N, OFFSET); 117d3ae85c4SBarry Smith printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); 118d3ae85c4SBarry Smith printf("Each test is run %d times, but only\n", NTIMES); 119d3ae85c4SBarry Smith printf("the *best* time for each is used.\n"); 120*51096aa5SJacob Faibussowitsch printf(HLINE); 121*51096aa5SJacob Faibussowitsch */ 122d3ae85c4SBarry Smith } 123d3ae85c4SBarry Smith 124d3ae85c4SBarry Smith /* Get initial value for system clock. */ 125*51096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) { 126d3ae85c4SBarry Smith a[j] = 1.0; 127d3ae85c4SBarry Smith b[j] = 2.0; 128d3ae85c4SBarry Smith c[j] = 0.0; 129d3ae85c4SBarry Smith } 130d3ae85c4SBarry Smith 131dd400576SPatrick Sanan if (rank == 0) { 132*51096aa5SJacob Faibussowitsch int quantum; 133d3ae85c4SBarry Smith if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ 134d3ae85c4SBarry Smith else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ 135d3ae85c4SBarry Smith } 136d3ae85c4SBarry Smith 13719623ac0SBarry Smith t = MPI_Wtime(); 138*51096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) a[j] *= 2.0; 13919623ac0SBarry Smith t = 1.0E6 * (MPI_Wtime() - t); 140d3ae85c4SBarry Smith 141dd400576SPatrick Sanan if (rank == 0) { 142*51096aa5SJacob Faibussowitsch /* 143*51096aa5SJacob Faibussowitsch printf("Each test below will take on the order of %d microseconds.\n", (int) t); 144d3ae85c4SBarry Smith printf(" (= %d clock ticks)\n", (int) (t/quantum)); 145d3ae85c4SBarry Smith printf("Increase the size of the arrays if this shows that\n"); 146d3ae85c4SBarry Smith printf("you are not getting at least 20 clock ticks per test.\n"); 147*51096aa5SJacob Faibussowitsch printf(HLINE); 148*51096aa5SJacob Faibussowitsch */ 149d3ae85c4SBarry Smith } 150d3ae85c4SBarry Smith 151d3ae85c4SBarry Smith /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 152d3ae85c4SBarry Smith 153*51096aa5SJacob Faibussowitsch for (int k = 0; k < NTIMES; ++k) { 154*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 15519623ac0SBarry Smith times[0][k] = MPI_Wtime(); 156d3ae85c4SBarry Smith /* should all these barriers be pulled outside of the time call? */ 157*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 158*51096aa5SJacob Faibussowitsch PetscCall(PetscArraycpy(c,a,N)); 159*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 16019623ac0SBarry Smith times[0][k] = MPI_Wtime() - times[0][k]; 161d3ae85c4SBarry Smith 16219623ac0SBarry Smith times[1][k] = MPI_Wtime(); 163*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 164*51096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) b[j] = scalar*c[j]; 165*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 16619623ac0SBarry Smith times[1][k] = MPI_Wtime() - times[1][k]; 167d3ae85c4SBarry Smith 16819623ac0SBarry Smith times[2][k] = MPI_Wtime(); 169*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 170*51096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) c[j] = a[j]+b[j]; 171*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 17219623ac0SBarry Smith times[2][k] = MPI_Wtime() - times[2][k]; 173d3ae85c4SBarry Smith 17419623ac0SBarry Smith times[3][k] = MPI_Wtime(); 175*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 176*51096aa5SJacob Faibussowitsch for (int j = 0; j < N; ++j) a[j] = b[j]+scalar*c[j]; 177*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 17819623ac0SBarry Smith times[3][k] = MPI_Wtime() - times[3][k]; 179d3ae85c4SBarry Smith } 180d3ae85c4SBarry Smith 181d3ae85c4SBarry Smith /* --- SUMMARY --- */ 182d3ae85c4SBarry Smith 183*51096aa5SJacob Faibussowitsch for (int k = 0; k < NTIMES; ++k) { 184*51096aa5SJacob Faibussowitsch for (int j = 0; j < 4; ++j) mintime[j] = PetscMin(mintime[j],times[j][k]); 185*51096aa5SJacob Faibussowitsch } 186d3ae85c4SBarry Smith 187*51096aa5SJacob Faibussowitsch for (int j = 0; j < 4; ++j) irate[j] = 1.0E-06 * bytes[j]/mintime[j]; 188*51096aa5SJacob Faibussowitsch PetscCallMPI(MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD)); 189d3ae85c4SBarry Smith 190*51096aa5SJacob Faibussowitsch if (!rank) { 191*51096aa5SJacob Faibussowitsch FILE *fd; 192*51096aa5SJacob Faibussowitsch 193*51096aa5SJacob Faibussowitsch if (size) { 1944198fb66SBarry Smith double prate; 195*51096aa5SJacob Faibussowitsch 1964198fb66SBarry Smith fd = fopen("flops","r"); 1974198fb66SBarry Smith fscanf(fd,"%lg",&prate); 1984198fb66SBarry Smith fclose(fd); 1994198fb66SBarry Smith printf("%d %11.4f Rate (MB/s) %g \n",size,rate[3],rate[3]/prate); 200*51096aa5SJacob Faibussowitsch } else { 201*51096aa5SJacob Faibussowitsch fd = fopen("flops","w"); 202*51096aa5SJacob Faibussowitsch fprintf(fd,"%g\n",rate[3]); 203*51096aa5SJacob Faibussowitsch fclose(fd); 204*51096aa5SJacob Faibussowitsch printf("%d %11.4f Rate (MB/s)\n",size,rate[3]); 2054198fb66SBarry Smith } 206d3ae85c4SBarry Smith } 207*51096aa5SJacob Faibussowitsch PetscCall(PetscFinalize()); 208d3ae85c4SBarry Smith return 0; 209d3ae85c4SBarry Smith } 210