xref: /petsc/src/benchmarks/streams/MPIVersion.c (revision c5853193be7e0cd23913bbf6af39be2e963e2513)
1d3ae85c4SBarry Smith #include <stdio.h>
2d3ae85c4SBarry Smith #include <math.h>
3d3ae85c4SBarry Smith #include <limits.h>
4d3ae85c4SBarry Smith #include <float.h>
55e71baefSBarry Smith #include <petscsys.h>
6d3ae85c4SBarry Smith 
7d3ae85c4SBarry Smith /*
80e3d61c9SBarry Smith   Program: Stream
90e3d61c9SBarry Smith   Programmer: Joe R. Zagar
100e3d61c9SBarry Smith   Revision: 4.0-BETA, October 24, 1995
110e3d61c9SBarry Smith   Original code developed by John D. McCalpin
120e3d61c9SBarry Smith 
130e3d61c9SBarry Smith   This program measures memory transfer rates in MB/s for simple
140e3d61c9SBarry Smith   computational kernels coded in C.  These numbers reveal the quality
150e3d61c9SBarry Smith   of code generation for simple uncacheable kernels as well as showing
160e3d61c9SBarry Smith   the cost of floating-point operations relative to memory accesses.
170e3d61c9SBarry Smith 
180e3d61c9SBarry Smith   INSTRUCTIONS:
190e3d61c9SBarry Smith 
200e3d61c9SBarry Smith         1) Stream requires a good bit of memory to run.  Adjust the
210e3d61c9SBarry Smith            value of 'N' (below) to give a 'timing calibration' of
220e3d61c9SBarry Smith            at least 20 clock-ticks.  This will provide rate estimates
230e3d61c9SBarry Smith            that should be good to about 5% precision.
24d3ae85c4SBarry Smith */
25d3ae85c4SBarry Smith 
26d3ae85c4SBarry Smith #define N      2000000
2751096aa5SJacob Faibussowitsch #define M      20
28d3ae85c4SBarry Smith #define NTIMES 50
29d3ae85c4SBarry Smith #define OFFSET 0
30d3ae85c4SBarry Smith 
31d3ae85c4SBarry Smith /*
320e3d61c9SBarry Smith        3) Compile the code with full optimization.  Many compilers
330e3d61c9SBarry Smith           generate unreasonably bad code before the optimizer tightens
340e3d61c9SBarry Smith           things up.  If the results are unreasonably good, on the
350e3d61c9SBarry Smith           other hand, the optimizer might be too smart for me!
360e3d61c9SBarry Smith 
370e3d61c9SBarry Smith           Try compiling with:
380e3d61c9SBarry Smith                 cc -O stream_d.c second.c -o stream_d -lm
390e3d61c9SBarry Smith 
400e3d61c9SBarry Smith           This is known to work on Cray, SGI, IBM, and Sun machines.
410e3d61c9SBarry Smith 
420e3d61c9SBarry Smith        4) Mail the results to mccalpin@cs.virginia.edu
430e3d61c9SBarry Smith           Be sure to include:
440e3d61c9SBarry Smith                a) computer hardware model number and software revision
450e3d61c9SBarry Smith                b) the compiler flags
460e3d61c9SBarry Smith                c) all of the output from the test case.
470e3d61c9SBarry Smith   Thanks!
480e3d61c9SBarry Smith 
49d3ae85c4SBarry Smith  */
50d3ae85c4SBarry Smith 
51d3ae85c4SBarry Smith #define HLINE "-------------------------------------------------------------\n"
52d3ae85c4SBarry Smith 
5351096aa5SJacob Faibussowitsch static double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET];
54d3ae85c4SBarry Smith 
55d3ae85c4SBarry Smith static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
56d3ae85c4SBarry Smith 
5751096aa5SJacob Faibussowitsch static int checktick(void)
5851096aa5SJacob Faibussowitsch {
5951096aa5SJacob Faibussowitsch   int    minDelta = 1000000;
6051096aa5SJacob Faibussowitsch   double timesfound[M];
6151096aa5SJacob Faibussowitsch 
6251096aa5SJacob Faibussowitsch   /* Collect a sequence of M unique time values from the system. */
6351096aa5SJacob Faibussowitsch 
6451096aa5SJacob Faibussowitsch   for (int i = 0; i < M; ++i) {
6551096aa5SJacob Faibussowitsch     const double t1 = MPI_Wtime();
6651096aa5SJacob Faibussowitsch 
6751096aa5SJacob Faibussowitsch     while (((timesfound[i] = MPI_Wtime()) - t1) < 1.0E-6) ;
6851096aa5SJacob Faibussowitsch   }
6951096aa5SJacob Faibussowitsch 
7051096aa5SJacob Faibussowitsch   /*
7151096aa5SJacob Faibussowitsch     Determine the minimum difference between these M values.
7251096aa5SJacob Faibussowitsch     This result will be our estimate (in microseconds) for the
7351096aa5SJacob Faibussowitsch     clock granularity.
7451096aa5SJacob Faibussowitsch   */
7551096aa5SJacob Faibussowitsch 
7651096aa5SJacob Faibussowitsch   for (int i = 1; i < M; ++i) {
7751096aa5SJacob Faibussowitsch     int Delta = (int)(1.0E6*(timesfound[i]-timesfound[i-1]));
7851096aa5SJacob Faibussowitsch 
7951096aa5SJacob Faibussowitsch     minDelta  = PetscMin(minDelta,PetscMax(Delta,0));
8051096aa5SJacob Faibussowitsch   }
8151096aa5SJacob Faibussowitsch   return minDelta;
8251096aa5SJacob Faibussowitsch }
8351096aa5SJacob Faibussowitsch 
84d3ae85c4SBarry Smith static double bytes[4] = {
85d3ae85c4SBarry Smith   2 * sizeof(double) * N,
86d3ae85c4SBarry Smith   2 * sizeof(double) * N,
87d3ae85c4SBarry Smith   3 * sizeof(double) * N,
88d3ae85c4SBarry Smith   3 * sizeof(double) * N
89d3ae85c4SBarry Smith };
90d3ae85c4SBarry Smith 
91d3ae85c4SBarry Smith int main(int argc,char **args)
92d3ae85c4SBarry Smith {
9351096aa5SJacob Faibussowitsch   const double scalar = 3.0;
9451096aa5SJacob Faibussowitsch   double       t,times[4][NTIMES],irate[4],rate[4];
9551096aa5SJacob Faibussowitsch   PetscMPIInt  rank,size,resultlen;
9651096aa5SJacob Faibussowitsch   char         hostname[MPI_MAX_PROCESSOR_NAME] = {0};
97d3ae85c4SBarry Smith 
98b8abcfdeSJacob Faibussowitsch   PetscCall(PetscInitialize(&argc,&args,NULL,NULL));
9951096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
10051096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
101d3ae85c4SBarry Smith 
10251096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Get_processor_name(hostname,&resultlen));(void)resultlen;
10351096aa5SJacob Faibussowitsch   if (rank) PetscCallMPI(MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD));
10451096aa5SJacob Faibussowitsch   else {
10551096aa5SJacob Faibussowitsch     for (int j = 1; j < size; ++j) {
10651096aa5SJacob Faibussowitsch       PetscCallMPI(MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE));
1076b58a888SBarry Smith     }
1081df1832dSBarry Smith   }
10951096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
110d3ae85c4SBarry Smith 
111d3ae85c4SBarry Smith   /* --- SETUP --- determine precision and check timing --- */
112d3ae85c4SBarry Smith 
113dd400576SPatrick Sanan   if (rank == 0) {
11451096aa5SJacob Faibussowitsch     /*
11551096aa5SJacob Faibussowitsch       printf(HLINE);
116d3ae85c4SBarry Smith       printf("Array size = %d, Offset = %d\n" , N, OFFSET);
117d3ae85c4SBarry Smith       printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0);
118d3ae85c4SBarry Smith       printf("Each test is run %d times, but only\n", NTIMES);
119d3ae85c4SBarry Smith       printf("the *best* time for each is used.\n");
12051096aa5SJacob Faibussowitsch       printf(HLINE);
12151096aa5SJacob Faibussowitsch     */
122d3ae85c4SBarry Smith   }
123d3ae85c4SBarry Smith 
124d3ae85c4SBarry Smith   /* Get initial value for system clock. */
12551096aa5SJacob Faibussowitsch   for (int j = 0; j < N; ++j) {
126d3ae85c4SBarry Smith     a[j] = 1.0;
127d3ae85c4SBarry Smith     b[j] = 2.0;
128d3ae85c4SBarry Smith     c[j] = 0.0;
129d3ae85c4SBarry Smith   }
130d3ae85c4SBarry Smith 
131dd400576SPatrick Sanan   if (rank == 0) {
13251096aa5SJacob Faibussowitsch     int quantum;
133d3ae85c4SBarry Smith     if  ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
134d3ae85c4SBarry Smith     else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/
135d3ae85c4SBarry Smith   }
136d3ae85c4SBarry Smith 
13719623ac0SBarry Smith   t = MPI_Wtime();
13851096aa5SJacob Faibussowitsch   for (int j = 0; j < N; ++j) a[j] *= 2.0;
13919623ac0SBarry Smith   t = 1.0E6 * (MPI_Wtime() - t);
140d3ae85c4SBarry Smith 
141dd400576SPatrick Sanan   if (rank == 0) {
14251096aa5SJacob Faibussowitsch     /*
14351096aa5SJacob Faibussowitsch       printf("Each test below will take on the order of %d microseconds.\n", (int) t);
144d3ae85c4SBarry Smith       printf("   (= %d clock ticks)\n", (int) (t/quantum));
145d3ae85c4SBarry Smith       printf("Increase the size of the arrays if this shows that\n");
146d3ae85c4SBarry Smith       printf("you are not getting at least 20 clock ticks per test.\n");
14751096aa5SJacob Faibussowitsch       printf(HLINE);
14851096aa5SJacob Faibussowitsch     */
149d3ae85c4SBarry Smith   }
150d3ae85c4SBarry Smith 
151d3ae85c4SBarry Smith   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
152d3ae85c4SBarry Smith 
15351096aa5SJacob Faibussowitsch   for (int k = 0; k < NTIMES; ++k) {
15451096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
15519623ac0SBarry Smith     times[0][k] = MPI_Wtime();
156d3ae85c4SBarry Smith     /* should all these barriers be pulled outside of the time call? */
15751096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
15851096aa5SJacob Faibussowitsch     PetscCall(PetscArraycpy(c,a,N));
15951096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
16019623ac0SBarry Smith     times[0][k] = MPI_Wtime() - times[0][k];
161d3ae85c4SBarry Smith 
16219623ac0SBarry Smith     times[1][k] = MPI_Wtime();
16351096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
16451096aa5SJacob Faibussowitsch     for (int j = 0; j < N; ++j) b[j] = scalar*c[j];
16551096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
16619623ac0SBarry Smith     times[1][k] = MPI_Wtime() - times[1][k];
167d3ae85c4SBarry Smith 
16819623ac0SBarry Smith     times[2][k] = MPI_Wtime();
16951096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
17051096aa5SJacob Faibussowitsch     for (int j = 0; j < N; ++j) c[j] = a[j]+b[j];
17151096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
17219623ac0SBarry Smith     times[2][k] = MPI_Wtime() - times[2][k];
173d3ae85c4SBarry Smith 
17419623ac0SBarry Smith     times[3][k] = MPI_Wtime();
17551096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
17651096aa5SJacob Faibussowitsch     for (int j = 0; j < N; ++j) a[j] = b[j]+scalar*c[j];
17751096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
17819623ac0SBarry Smith     times[3][k] = MPI_Wtime() - times[3][k];
179d3ae85c4SBarry Smith   }
180d3ae85c4SBarry Smith 
181d3ae85c4SBarry Smith   /*   --- SUMMARY --- */
182d3ae85c4SBarry Smith 
18351096aa5SJacob Faibussowitsch   for (int k = 0; k < NTIMES; ++k) {
18451096aa5SJacob Faibussowitsch     for (int j = 0; j < 4; ++j) mintime[j] = PetscMin(mintime[j],times[j][k]);
18551096aa5SJacob Faibussowitsch   }
186d3ae85c4SBarry Smith 
18751096aa5SJacob Faibussowitsch   for (int j = 0; j < 4; ++j) irate[j] = 1.0E-06 * bytes[j]/mintime[j];
18851096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
189d3ae85c4SBarry Smith 
190*c5853193SPierre Jolivet   if (rank == 0) {
19151096aa5SJacob Faibussowitsch     FILE *fd;
19251096aa5SJacob Faibussowitsch 
19351096aa5SJacob Faibussowitsch     if (size) {
1944198fb66SBarry Smith       double prate;
19551096aa5SJacob Faibussowitsch 
1964198fb66SBarry Smith       fd = fopen("flops","r");
1974198fb66SBarry Smith       fscanf(fd,"%lg",&prate);
1984198fb66SBarry Smith       fclose(fd);
1994198fb66SBarry Smith       printf("%d %11.4f   Rate (MB/s) %g \n",size,rate[3],rate[3]/prate);
20051096aa5SJacob Faibussowitsch     } else {
20151096aa5SJacob Faibussowitsch       fd = fopen("flops","w");
20251096aa5SJacob Faibussowitsch       fprintf(fd,"%g\n",rate[3]);
20351096aa5SJacob Faibussowitsch       fclose(fd);
20451096aa5SJacob Faibussowitsch       printf("%d %11.4f   Rate (MB/s)\n",size,rate[3]);
2054198fb66SBarry Smith     }
206d3ae85c4SBarry Smith   }
20751096aa5SJacob Faibussowitsch   PetscCall(PetscFinalize());
208d3ae85c4SBarry Smith   return 0;
209d3ae85c4SBarry Smith }
210