xref: /petsc/src/benchmarks/streams/MPIVersion.c (revision b8abcfde4cf799610cd89775278ac4145d1798ce)
1d3ae85c4SBarry Smith 
2d3ae85c4SBarry Smith # include <stdio.h>
3d3ae85c4SBarry Smith # include <math.h>
4d3ae85c4SBarry Smith # include <limits.h>
5d3ae85c4SBarry Smith # include <float.h>
65e71baefSBarry Smith #include <petscsys.h>
7d3ae85c4SBarry Smith 
8d3ae85c4SBarry Smith /*
90e3d61c9SBarry Smith   Program: Stream
100e3d61c9SBarry Smith   Programmer: Joe R. Zagar
110e3d61c9SBarry Smith   Revision: 4.0-BETA, October 24, 1995
120e3d61c9SBarry Smith   Original code developed by John D. McCalpin
130e3d61c9SBarry Smith 
140e3d61c9SBarry Smith   This program measures memory transfer rates in MB/s for simple
150e3d61c9SBarry Smith   computational kernels coded in C.  These numbers reveal the quality
160e3d61c9SBarry Smith   of code generation for simple uncacheable kernels as well as showing
170e3d61c9SBarry Smith   the cost of floating-point operations relative to memory accesses.
180e3d61c9SBarry Smith 
190e3d61c9SBarry Smith   INSTRUCTIONS:
200e3d61c9SBarry Smith 
210e3d61c9SBarry Smith         1) Stream requires a good bit of memory to run.  Adjust the
220e3d61c9SBarry Smith            value of 'N' (below) to give a 'timing calibration' of
230e3d61c9SBarry Smith            at least 20 clock-ticks.  This will provide rate estimates
240e3d61c9SBarry Smith            that should be good to about 5% precision.
25d3ae85c4SBarry Smith */
26d3ae85c4SBarry Smith 
/*
  Benchmark configuration.  Every value keeps its historical default but can
  now be overridden on the compile line (for example -DN=8000000) instead of
  editing this file.

    N      - number of doubles in each of the three work arrays
    NTIMES - number of times each kernel is repeated
    OFFSET - extra elements added to the declared length of each work array
*/
#ifndef N
#define N      2000000
#endif
#ifndef NTIMES
#define NTIMES 50
#endif
#ifndef OFFSET
#define OFFSET 0
#endif
30d3ae85c4SBarry Smith 
31d3ae85c4SBarry Smith /*
320e3d61c9SBarry Smith        3) Compile the code with full optimization.  Many compilers
330e3d61c9SBarry Smith           generate unreasonably bad code before the optimizer tightens
340e3d61c9SBarry Smith           things up.  If the results are unreasonably good, on the
350e3d61c9SBarry Smith           other hand, the optimizer might be too smart for me!
360e3d61c9SBarry Smith 
370e3d61c9SBarry Smith           Try compiling with:
380e3d61c9SBarry Smith                 cc -O stream_d.c second.c -o stream_d -lm
390e3d61c9SBarry Smith 
400e3d61c9SBarry Smith           This is known to work on Cray, SGI, IBM, and Sun machines.
410e3d61c9SBarry Smith 
420e3d61c9SBarry Smith        4) Mail the results to mccalpin@cs.virginia.edu
430e3d61c9SBarry Smith           Be sure to include:
440e3d61c9SBarry Smith                a) computer hardware model number and software revision
450e3d61c9SBarry Smith                b) the compiler flags
460e3d61c9SBarry Smith                c) all of the output from the test case.
470e3d61c9SBarry Smith   Thanks!
480e3d61c9SBarry Smith 
49d3ae85c4SBarry Smith  */
50d3ae85c4SBarry Smith 
/* Horizontal rule used when printing report sections. */
#define HLINE "-------------------------------------------------------------\n"

/*
  Smaller / larger of two values.  Defined only when no earlier header has
  already supplied a macro of the same name.  Each argument may be evaluated
  twice, so do not pass expressions with side effects.
*/
#if !defined(MIN)
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif
#if !defined(MAX)
#define MAX(x,y) (((x) > (y)) ? (x) : (y))
#endif
59d3ae85c4SBarry Smith 
60d3ae85c4SBarry Smith static double a[N+OFFSET],
61d3ae85c4SBarry Smith               b[N+OFFSET],
62d3ae85c4SBarry Smith               c[N+OFFSET];
63d3ae85c4SBarry Smith /*double *a,*b,*c;*/
64d3ae85c4SBarry Smith 
65d3ae85c4SBarry Smith static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
66d3ae85c4SBarry Smith 
67d3ae85c4SBarry Smith static double bytes[4] = {
68d3ae85c4SBarry Smith   2 * sizeof(double) * N,
69d3ae85c4SBarry Smith   2 * sizeof(double) * N,
70d3ae85c4SBarry Smith   3 * sizeof(double) * N,
71d3ae85c4SBarry Smith   3 * sizeof(double) * N
72d3ae85c4SBarry Smith };
73d3ae85c4SBarry Smith 
74d3ae85c4SBarry Smith int main(int argc,char **args)
75d3ae85c4SBarry Smith {
76d1d3a73cSBarry Smith   int            quantum, checktick(void);
77d3ae85c4SBarry Smith   register int   j, k;
78d3ae85c4SBarry Smith   double         scalar, t, times[4][NTIMES],irate[4],rate[4];
79d3ae85c4SBarry Smith   int            rank,size,resultlen;
80d3ae85c4SBarry Smith   char           hostname[MPI_MAX_PROCESSOR_NAME];
811df1832dSBarry Smith   MPI_Status     status;
828a4d7553SBarry Smith   int            ierr;
834198fb66SBarry Smith   FILE           *fd;
84d3ae85c4SBarry Smith 
85*b8abcfdeSJacob Faibussowitsch   PetscCall(PetscInitialize(&argc,&args,NULL,NULL));
868a4d7553SBarry Smith   ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr;
878a4d7553SBarry Smith   ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr;
88d3ae85c4SBarry Smith 
896b58a888SBarry Smith   for (j=0; j<MPI_MAX_PROCESSOR_NAME; j++) {
906b58a888SBarry Smith     hostname[j] = 0;
916b58a888SBarry Smith   }
928a4d7553SBarry Smith   ierr = MPI_Get_processor_name(hostname,&resultlen);if (ierr) return ierr;
93dd400576SPatrick Sanan   if (rank == 0) {
941df1832dSBarry Smith     for (j=1; j<size; j++) {
958a4d7553SBarry Smith       ierr = MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,&status);if (ierr) return ierr;
961df1832dSBarry Smith     }
971df1832dSBarry Smith  } else {
988a4d7553SBarry Smith    ierr = MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD);if (ierr) return ierr;
99d3ae85c4SBarry Smith  }
100b7250d5dSSatish Balay  ierr = MPI_Barrier(MPI_COMM_WORLD);
101d3ae85c4SBarry Smith 
102d3ae85c4SBarry Smith   /* --- SETUP --- determine precision and check timing --- */
103d3ae85c4SBarry Smith 
104dd400576SPatrick Sanan   if (rank == 0) {
105d3ae85c4SBarry Smith     /*printf(HLINE);
106d3ae85c4SBarry Smith     printf("Array size = %d, Offset = %d\n" , N, OFFSET);
107d3ae85c4SBarry Smith     printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0);
108d3ae85c4SBarry Smith     printf("Each test is run %d times, but only\n", NTIMES);
109d3ae85c4SBarry Smith     printf("the *best* time for each is used.\n");
110d3ae85c4SBarry Smith     printf(HLINE); */
111d3ae85c4SBarry Smith   }
112d3ae85c4SBarry Smith 
113d3ae85c4SBarry Smith   /* Get initial value for system clock. */
114d3ae85c4SBarry Smith 
115d3ae85c4SBarry Smith   /*  a = malloc(N*sizeof(double));
116d3ae85c4SBarry Smith   b = malloc(N*sizeof(double));
117d3ae85c4SBarry Smith   c = malloc(N*sizeof(double));*/
118d3ae85c4SBarry Smith   for (j=0; j<N; j++) {
119d3ae85c4SBarry Smith     a[j] = 1.0;
120d3ae85c4SBarry Smith     b[j] = 2.0;
121d3ae85c4SBarry Smith     c[j] = 0.0;
122d3ae85c4SBarry Smith   }
123d3ae85c4SBarry Smith 
124dd400576SPatrick Sanan   if (rank == 0) {
125d3ae85c4SBarry Smith     if  ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
126d3ae85c4SBarry Smith     else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/
127d3ae85c4SBarry Smith   }
128d3ae85c4SBarry Smith 
12919623ac0SBarry Smith   t = MPI_Wtime();
130d3ae85c4SBarry Smith   for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
13119623ac0SBarry Smith   t = 1.0E6 * (MPI_Wtime() - t);
132d3ae85c4SBarry Smith 
133dd400576SPatrick Sanan   if (rank == 0) {
134d3ae85c4SBarry Smith     /*  printf("Each test below will take on the order of %d microseconds.\n", (int) t);
135d3ae85c4SBarry Smith     printf("   (= %d clock ticks)\n", (int) (t/quantum));
136d3ae85c4SBarry Smith     printf("Increase the size of the arrays if this shows that\n");
137d3ae85c4SBarry Smith     printf("you are not getting at least 20 clock ticks per test.\n");
138d3ae85c4SBarry Smith     printf(HLINE);*/
139d3ae85c4SBarry Smith   }
140d3ae85c4SBarry Smith 
141d3ae85c4SBarry Smith   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
142d3ae85c4SBarry Smith 
143d3ae85c4SBarry Smith   scalar = 3.0;
144d3ae85c4SBarry Smith   for (k=0; k<NTIMES; k++)
145d3ae85c4SBarry Smith   {
146b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
14719623ac0SBarry Smith     times[0][k] = MPI_Wtime();
148d3ae85c4SBarry Smith     /* should all these barriers be pulled outside of the time call? */
149b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
150d3ae85c4SBarry Smith     for (j=0; j<N; j++) c[j] = a[j];
151b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
15219623ac0SBarry Smith     times[0][k] = MPI_Wtime() - times[0][k];
153d3ae85c4SBarry Smith 
15419623ac0SBarry Smith     times[1][k] = MPI_Wtime();
155b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
156d3ae85c4SBarry Smith     for (j=0; j<N; j++) b[j] = scalar*c[j];
157b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
15819623ac0SBarry Smith     times[1][k] = MPI_Wtime() - times[1][k];
159d3ae85c4SBarry Smith 
16019623ac0SBarry Smith     times[2][k] = MPI_Wtime();
161b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
162d3ae85c4SBarry Smith     for (j=0; j<N; j++) c[j] = a[j]+b[j];
163b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
16419623ac0SBarry Smith     times[2][k] = MPI_Wtime() - times[2][k];
165d3ae85c4SBarry Smith 
16619623ac0SBarry Smith     times[3][k] = MPI_Wtime();
167b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
168d3ae85c4SBarry Smith     for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
169b7250d5dSSatish Balay     ierr = MPI_Barrier(MPI_COMM_WORLD);
17019623ac0SBarry Smith     times[3][k] = MPI_Wtime() - times[3][k];
171d3ae85c4SBarry Smith   }
172d3ae85c4SBarry Smith 
173d3ae85c4SBarry Smith   /*   --- SUMMARY --- */
174d3ae85c4SBarry Smith 
175d3ae85c4SBarry Smith   for (k=0; k<NTIMES; k++)
176d3ae85c4SBarry Smith     for (j=0; j<4; j++) mintime[j] = MIN(mintime[j], times[j][k]);
177d3ae85c4SBarry Smith 
178d3ae85c4SBarry Smith   for (j=0; j<4; j++) irate[j] = 1.0E-06 * bytes[j]/mintime[j];
179b7250d5dSSatish Balay   ierr = MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
180b7250d5dSSatish Balay   if (ierr) printf("Error calling MPI\n");
181d3ae85c4SBarry Smith 
182dd400576SPatrick Sanan   if (rank == 0) {
1834198fb66SBarry Smith     if (size == 1) {
1844198fb66SBarry Smith       printf("%d %11.4f   Rate (MB/s)\n",size, rate[3]);
1854198fb66SBarry Smith       fd = fopen("flops","w");
1864198fb66SBarry Smith       fprintf(fd,"%g\n",rate[3]);
1874198fb66SBarry Smith       fclose(fd);
1884198fb66SBarry Smith     } else {
1894198fb66SBarry Smith       double prate;
1904198fb66SBarry Smith       fd = fopen("flops","r");
1914198fb66SBarry Smith       fscanf(fd,"%lg",&prate);
1924198fb66SBarry Smith       fclose(fd);
1934198fb66SBarry Smith       printf("%d %11.4f   Rate (MB/s) %g \n", size, rate[3],rate[3]/prate);
1944198fb66SBarry Smith     }
195d3ae85c4SBarry Smith   }
1965e71baefSBarry Smith   PetscFinalize();
197d3ae85c4SBarry Smith   return 0;
198d3ae85c4SBarry Smith }
199d3ae85c4SBarry Smith 
200