xref: /petsc/src/benchmarks/streams/MPIVersion.c (revision 51096aa50358f66cbf0c2dad5ec9eaa5d6ab2381)
1d3ae85c4SBarry Smith #include <stdio.h>
2d3ae85c4SBarry Smith #include <math.h>
3d3ae85c4SBarry Smith #include <limits.h>
4d3ae85c4SBarry Smith #include <float.h>
55e71baefSBarry Smith #include <petscsys.h>
6d3ae85c4SBarry Smith 
7d3ae85c4SBarry Smith /*
80e3d61c9SBarry Smith   Program: Stream
90e3d61c9SBarry Smith   Programmer: Joe R. Zagar
100e3d61c9SBarry Smith   Revision: 4.0-BETA, October 24, 1995
110e3d61c9SBarry Smith   Original code developed by John D. McCalpin
120e3d61c9SBarry Smith 
130e3d61c9SBarry Smith   This program measures memory transfer rates in MB/s for simple
140e3d61c9SBarry Smith   computational kernels coded in C.  These numbers reveal the quality
150e3d61c9SBarry Smith   of code generation for simple uncacheable kernels as well as showing
160e3d61c9SBarry Smith   the cost of floating-point operations relative to memory accesses.
170e3d61c9SBarry Smith 
180e3d61c9SBarry Smith   INSTRUCTIONS:
190e3d61c9SBarry Smith 
200e3d61c9SBarry Smith         1) Stream requires a good bit of memory to run.  Adjust the
210e3d61c9SBarry Smith            value of 'N' (below) to give a 'timing calibration' of
220e3d61c9SBarry Smith            at least 20 clock-ticks.  This will provide rate estimates
230e3d61c9SBarry Smith            that should be good to about 5% precision.
24d3ae85c4SBarry Smith */
25d3ae85c4SBarry Smith 
/*
  Compile-time benchmark parameters:
    N      - number of doubles per array processed by each kernel loop in main()
    M      - number of clock samples collected by checktick()
    NTIMES - number of times main() repeats the set of four timed kernels
    OFFSET - extra elements added to the declared length of the arrays a, b and c
*/
26d3ae85c4SBarry Smith #define N      2000000
27*51096aa5SJacob Faibussowitsch #define M      20
28d3ae85c4SBarry Smith #define NTIMES 50
29d3ae85c4SBarry Smith #define OFFSET 0
30d3ae85c4SBarry Smith 
31d3ae85c4SBarry Smith /*
320e3d61c9SBarry Smith        3) Compile the code with full optimization.  Many compilers
330e3d61c9SBarry Smith           generate unreasonably bad code before the optimizer tightens
340e3d61c9SBarry Smith           things up.  If the results are unreasonably good, on the
350e3d61c9SBarry Smith           other hand, the optimizer might be too smart for me!
360e3d61c9SBarry Smith 
370e3d61c9SBarry Smith           Try compiling with:
380e3d61c9SBarry Smith                 cc -O stream_d.c second.c -o stream_d -lm
390e3d61c9SBarry Smith 
400e3d61c9SBarry Smith           This is known to work on Cray, SGI, IBM, and Sun machines.
410e3d61c9SBarry Smith 
420e3d61c9SBarry Smith        4) Mail the results to mccalpin@cs.virginia.edu
430e3d61c9SBarry Smith           Be sure to include:
440e3d61c9SBarry Smith                a) computer hardware model number and software revision
450e3d61c9SBarry Smith                b) the compiler flags
460e3d61c9SBarry Smith                c) all of the output from the test case.
470e3d61c9SBarry Smith   Thanks!
480e3d61c9SBarry Smith 
49d3ae85c4SBarry Smith  */
50d3ae85c4SBarry Smith 
/* Row of dashes followed by a newline, meant as a separator when printing a report. */
51d3ae85c4SBarry Smith #define HLINE "-------------------------------------------------------------\n"
52d3ae85c4SBarry Smith 
/* The three work arrays read and written by the kernels in main(); each holds N+OFFSET doubles. */
53*51096aa5SJacob Faibussowitsch static double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET];
54d3ae85c4SBarry Smith 
/*
  Smallest elapsed time seen so far for each of the four kernels. main() lowers an
  entry whenever a measured time is smaller, so every entry starts at FLT_MAX.
*/
55d3ae85c4SBarry Smith static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
56d3ae85c4SBarry Smith 
57*51096aa5SJacob Faibussowitsch static int checktick(void)
58*51096aa5SJacob Faibussowitsch {
59*51096aa5SJacob Faibussowitsch   int    minDelta = 1000000;
60*51096aa5SJacob Faibussowitsch   double timesfound[M];
61*51096aa5SJacob Faibussowitsch 
62*51096aa5SJacob Faibussowitsch   /* Collect a sequence of M unique time values from the system. */
63*51096aa5SJacob Faibussowitsch 
64*51096aa5SJacob Faibussowitsch   for (int i = 0; i < M; ++i) {
65*51096aa5SJacob Faibussowitsch     const double t1 = MPI_Wtime();
66*51096aa5SJacob Faibussowitsch 
67*51096aa5SJacob Faibussowitsch     while (((timesfound[i] = MPI_Wtime()) - t1) < 1.0E-6) ;
68*51096aa5SJacob Faibussowitsch   }
69*51096aa5SJacob Faibussowitsch 
70*51096aa5SJacob Faibussowitsch   /*
71*51096aa5SJacob Faibussowitsch     Determine the minimum difference between these M values.
72*51096aa5SJacob Faibussowitsch     This result will be our estimate (in microseconds) for the
73*51096aa5SJacob Faibussowitsch     clock granularity.
74*51096aa5SJacob Faibussowitsch   */
75*51096aa5SJacob Faibussowitsch 
76*51096aa5SJacob Faibussowitsch   for (int i = 1; i < M; ++i) {
77*51096aa5SJacob Faibussowitsch     int Delta = (int)(1.0E6*(timesfound[i]-timesfound[i-1]));
78*51096aa5SJacob Faibussowitsch 
79*51096aa5SJacob Faibussowitsch     minDelta  = PetscMin(minDelta,PetscMax(Delta,0));
80*51096aa5SJacob Faibussowitsch   }
81*51096aa5SJacob Faibussowitsch   return minDelta;
82*51096aa5SJacob Faibussowitsch }
83*51096aa5SJacob Faibussowitsch 
84d3ae85c4SBarry Smith static double bytes[4] = {
85d3ae85c4SBarry Smith   2 * sizeof(double) * N,
86d3ae85c4SBarry Smith   2 * sizeof(double) * N,
87d3ae85c4SBarry Smith   3 * sizeof(double) * N,
88d3ae85c4SBarry Smith   3 * sizeof(double) * N
89d3ae85c4SBarry Smith };
90d3ae85c4SBarry Smith 
91d3ae85c4SBarry Smith int main(int argc,char **args)
92d3ae85c4SBarry Smith {
93*51096aa5SJacob Faibussowitsch   const double scalar = 3.0;
94*51096aa5SJacob Faibussowitsch   double       t,times[4][NTIMES],irate[4],rate[4];
95*51096aa5SJacob Faibussowitsch   PetscMPIInt  rank,size,resultlen;
96*51096aa5SJacob Faibussowitsch   char         hostname[MPI_MAX_PROCESSOR_NAME] = {0};
97d3ae85c4SBarry Smith 
98b8abcfdeSJacob Faibussowitsch   PetscCall(PetscInitialize(&argc,&args,NULL,NULL));
99*51096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
100*51096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
101d3ae85c4SBarry Smith 
102*51096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Get_processor_name(hostname,&resultlen));(void)resultlen;
103*51096aa5SJacob Faibussowitsch   if (rank) PetscCallMPI(MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD));
104*51096aa5SJacob Faibussowitsch   else {
105*51096aa5SJacob Faibussowitsch     for (int j = 1; j < size; ++j) {
106*51096aa5SJacob Faibussowitsch       PetscCallMPI(MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE));
1076b58a888SBarry Smith     }
1081df1832dSBarry Smith   }
109*51096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
110d3ae85c4SBarry Smith 
111d3ae85c4SBarry Smith   /* --- SETUP --- determine precision and check timing --- */
112d3ae85c4SBarry Smith 
113dd400576SPatrick Sanan   if (rank == 0) {
114*51096aa5SJacob Faibussowitsch     /*
115*51096aa5SJacob Faibussowitsch       printf(HLINE);
116d3ae85c4SBarry Smith       printf("Array size = %d, Offset = %d\n" , N, OFFSET);
117d3ae85c4SBarry Smith       printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0);
118d3ae85c4SBarry Smith       printf("Each test is run %d times, but only\n", NTIMES);
119d3ae85c4SBarry Smith       printf("the *best* time for each is used.\n");
120*51096aa5SJacob Faibussowitsch       printf(HLINE);
121*51096aa5SJacob Faibussowitsch     */
122d3ae85c4SBarry Smith   }
123d3ae85c4SBarry Smith 
124d3ae85c4SBarry Smith   /* Get initial value for system clock. */
125*51096aa5SJacob Faibussowitsch   for (int j = 0; j < N; ++j) {
126d3ae85c4SBarry Smith     a[j] = 1.0;
127d3ae85c4SBarry Smith     b[j] = 2.0;
128d3ae85c4SBarry Smith     c[j] = 0.0;
129d3ae85c4SBarry Smith   }
130d3ae85c4SBarry Smith 
131dd400576SPatrick Sanan   if (rank == 0) {
132*51096aa5SJacob Faibussowitsch     int quantum;
133d3ae85c4SBarry Smith     if  ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
134d3ae85c4SBarry Smith     else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/
135d3ae85c4SBarry Smith   }
136d3ae85c4SBarry Smith 
13719623ac0SBarry Smith   t = MPI_Wtime();
138*51096aa5SJacob Faibussowitsch   for (int j = 0; j < N; ++j) a[j] *= 2.0;
13919623ac0SBarry Smith   t = 1.0E6 * (MPI_Wtime() - t);
140d3ae85c4SBarry Smith 
141dd400576SPatrick Sanan   if (rank == 0) {
142*51096aa5SJacob Faibussowitsch     /*
143*51096aa5SJacob Faibussowitsch       printf("Each test below will take on the order of %d microseconds.\n", (int) t);
144d3ae85c4SBarry Smith       printf("   (= %d clock ticks)\n", (int) (t/quantum));
145d3ae85c4SBarry Smith       printf("Increase the size of the arrays if this shows that\n");
146d3ae85c4SBarry Smith       printf("you are not getting at least 20 clock ticks per test.\n");
147*51096aa5SJacob Faibussowitsch       printf(HLINE);
148*51096aa5SJacob Faibussowitsch     */
149d3ae85c4SBarry Smith   }
150d3ae85c4SBarry Smith 
151d3ae85c4SBarry Smith   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
152d3ae85c4SBarry Smith 
153*51096aa5SJacob Faibussowitsch   for (int k = 0; k < NTIMES; ++k) {
154*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
15519623ac0SBarry Smith     times[0][k] = MPI_Wtime();
156d3ae85c4SBarry Smith     /* should all these barriers be pulled outside of the time call? */
157*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
158*51096aa5SJacob Faibussowitsch     PetscCall(PetscArraycpy(c,a,N));
159*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
16019623ac0SBarry Smith     times[0][k] = MPI_Wtime() - times[0][k];
161d3ae85c4SBarry Smith 
16219623ac0SBarry Smith     times[1][k] = MPI_Wtime();
163*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
164*51096aa5SJacob Faibussowitsch     for (int j = 0; j < N; ++j) b[j] = scalar*c[j];
165*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
16619623ac0SBarry Smith     times[1][k] = MPI_Wtime() - times[1][k];
167d3ae85c4SBarry Smith 
16819623ac0SBarry Smith     times[2][k] = MPI_Wtime();
169*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
170*51096aa5SJacob Faibussowitsch     for (int j = 0; j < N; ++j) c[j] = a[j]+b[j];
171*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
17219623ac0SBarry Smith     times[2][k] = MPI_Wtime() - times[2][k];
173d3ae85c4SBarry Smith 
17419623ac0SBarry Smith     times[3][k] = MPI_Wtime();
175*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
176*51096aa5SJacob Faibussowitsch     for (int j = 0; j < N; ++j) a[j] = b[j]+scalar*c[j];
177*51096aa5SJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
17819623ac0SBarry Smith     times[3][k] = MPI_Wtime() - times[3][k];
179d3ae85c4SBarry Smith   }
180d3ae85c4SBarry Smith 
181d3ae85c4SBarry Smith   /*   --- SUMMARY --- */
182d3ae85c4SBarry Smith 
183*51096aa5SJacob Faibussowitsch   for (int k = 0; k < NTIMES; ++k) {
184*51096aa5SJacob Faibussowitsch     for (int j = 0; j < 4; ++j) mintime[j] = PetscMin(mintime[j],times[j][k]);
185*51096aa5SJacob Faibussowitsch   }
186d3ae85c4SBarry Smith 
187*51096aa5SJacob Faibussowitsch   for (int j = 0; j < 4; ++j) irate[j] = 1.0E-06 * bytes[j]/mintime[j];
188*51096aa5SJacob Faibussowitsch   PetscCallMPI(MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
189d3ae85c4SBarry Smith 
190*51096aa5SJacob Faibussowitsch   if (!rank) {
191*51096aa5SJacob Faibussowitsch     FILE *fd;
192*51096aa5SJacob Faibussowitsch 
193*51096aa5SJacob Faibussowitsch     if (size) {
1944198fb66SBarry Smith       double prate;
195*51096aa5SJacob Faibussowitsch 
1964198fb66SBarry Smith       fd = fopen("flops","r");
1974198fb66SBarry Smith       fscanf(fd,"%lg",&prate);
1984198fb66SBarry Smith       fclose(fd);
1994198fb66SBarry Smith       printf("%d %11.4f   Rate (MB/s) %g \n",size,rate[3],rate[3]/prate);
200*51096aa5SJacob Faibussowitsch     } else {
201*51096aa5SJacob Faibussowitsch       fd = fopen("flops","w");
202*51096aa5SJacob Faibussowitsch       fprintf(fd,"%g\n",rate[3]);
203*51096aa5SJacob Faibussowitsch       fclose(fd);
204*51096aa5SJacob Faibussowitsch       printf("%d %11.4f   Rate (MB/s)\n",size,rate[3]);
2054198fb66SBarry Smith     }
206d3ae85c4SBarry Smith   }
207*51096aa5SJacob Faibussowitsch   PetscCall(PetscFinalize());
208d3ae85c4SBarry Smith   return 0;
209d3ae85c4SBarry Smith }
210