xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 5f80ce2ab25dff0f4601e710601cbbcecf323266)
1403adfb6SMatthew G Knepley /*
2403adfb6SMatthew G Knepley   STREAM benchmark implementation in CUDA.
3403adfb6SMatthew G Knepley 
4403adfb6SMatthew G Knepley     COPY:       a(i) = b(i)
5403adfb6SMatthew G Knepley     SCALE:      a(i) = q*b(i)
6403adfb6SMatthew G Knepley     SUM:        a(i) = b(i) + c(i)
7403adfb6SMatthew G Knepley     TRIAD:      a(i) = b(i) + q*c(i)
8403adfb6SMatthew G Knepley 
9403adfb6SMatthew G Knepley   It measures the memory system on the device.
1019816777SMark   The implementation is in double precision with a single option.
11403adfb6SMatthew G Knepley 
12403adfb6SMatthew G Knepley   Code based on the code developed by John D. McCalpin
13403adfb6SMatthew G Knepley   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14403adfb6SMatthew G Knepley 
15403adfb6SMatthew G Knepley   Written by: Massimiliano Fatica, NVIDIA Corporation
16403adfb6SMatthew G Knepley   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17403adfb6SMatthew G Knepley   Extensive Revisions, 4 December 2010
18403adfb6SMatthew G Knepley   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19403adfb6SMatthew G Knepley 
20403adfb6SMatthew G Knepley   User interface motivated by bandwidthTest NVIDIA SDK example.
21403adfb6SMatthew G Knepley */
2219816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23403adfb6SMatthew G Knepley 
24403adfb6SMatthew G Knepley #include <petscconf.h>
25403adfb6SMatthew G Knepley #include <petscsys.h>
26403adfb6SMatthew G Knepley #include <petsctime.h>
27*5f80ce2aSJacob Faibussowitsch #include <petscdevice.h>
28403adfb6SMatthew G Knepley 
2919816777SMark #define N        10000000
30403adfb6SMatthew G Knepley #define NTIMES   10
31403adfb6SMatthew G Knepley 
32403adfb6SMatthew G Knepley # ifndef MIN
33403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y) ? (x) : (y))
34403adfb6SMatthew G Knepley # endif
35403adfb6SMatthew G Knepley # ifndef MAX
36403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y) ? (x) : (y))
37403adfb6SMatthew G Knepley # endif
38403adfb6SMatthew G Knepley 
39403adfb6SMatthew G Knepley const float  flt_eps = 1.192092896e-07f;
40caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16;
41403adfb6SMatthew G Knepley 
42403adfb6SMatthew G Knepley __global__ void set_array(float *a,  float value, size_t len)
43403adfb6SMatthew G Knepley {
44403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
45403adfb6SMatthew G Knepley   while (idx < len) {
46403adfb6SMatthew G Knepley     a[idx] = value;
47403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
48403adfb6SMatthew G Knepley   }
49403adfb6SMatthew G Knepley }
50403adfb6SMatthew G Knepley 
51caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a,  double value, size_t len)
52caccb7e3SMatthew G Knepley {
53caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
54caccb7e3SMatthew G Knepley   while (idx < len) {
55caccb7e3SMatthew G Knepley     a[idx] = value;
56caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
57caccb7e3SMatthew G Knepley   }
58caccb7e3SMatthew G Knepley }
59caccb7e3SMatthew G Knepley 
60403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len)
61403adfb6SMatthew G Knepley {
62403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
63403adfb6SMatthew G Knepley   while (idx < len) {
64403adfb6SMatthew G Knepley     b[idx] = a[idx];
65403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
66403adfb6SMatthew G Knepley   }
67403adfb6SMatthew G Knepley }
68403adfb6SMatthew G Knepley 
69caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len)
70caccb7e3SMatthew G Knepley {
71caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
72caccb7e3SMatthew G Knepley   while (idx < len) {
73caccb7e3SMatthew G Knepley     b[idx] = a[idx];
74caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
75caccb7e3SMatthew G Knepley   }
76caccb7e3SMatthew G Knepley }
77caccb7e3SMatthew G Knepley 
78403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
79403adfb6SMatthew G Knepley {
80403adfb6SMatthew G Knepley   /*
81403adfb6SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
82403adfb6SMatthew G Knepley    * vector index space else return.
83403adfb6SMatthew G Knepley    */
84403adfb6SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
85403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
86403adfb6SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
87403adfb6SMatthew G Knepley }
88403adfb6SMatthew G Knepley 
89caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
90caccb7e3SMatthew G Knepley {
91caccb7e3SMatthew G Knepley   /*
92caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
93caccb7e3SMatthew G Knepley    * vector index space else return.
94caccb7e3SMatthew G Knepley    */
95caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
96caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
97caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
98caccb7e3SMatthew G Knepley }
99caccb7e3SMatthew G Knepley 
100403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
101403adfb6SMatthew G Knepley {
102403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
103403adfb6SMatthew G Knepley   while (idx < len) {
104403adfb6SMatthew G Knepley     b[idx] = scale* a[idx];
105403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
106403adfb6SMatthew G Knepley   }
107403adfb6SMatthew G Knepley }
108403adfb6SMatthew G Knepley 
109caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
110caccb7e3SMatthew G Knepley {
111caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
112caccb7e3SMatthew G Knepley   while (idx < len) {
113caccb7e3SMatthew G Knepley     b[idx] = scale* a[idx];
114caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
115caccb7e3SMatthew G Knepley   }
116caccb7e3SMatthew G Knepley }
117caccb7e3SMatthew G Knepley 
118caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
119caccb7e3SMatthew G Knepley {
120caccb7e3SMatthew G Knepley   /*
121caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
122caccb7e3SMatthew G Knepley    * vector index space else return.
123caccb7e3SMatthew G Knepley    */
124caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
125caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
126caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
127caccb7e3SMatthew G Knepley }
128caccb7e3SMatthew G Knepley 
129caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
130caccb7e3SMatthew G Knepley {
131caccb7e3SMatthew G Knepley   /*
132caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
133caccb7e3SMatthew G Knepley    * vector index space else return.
134caccb7e3SMatthew G Knepley    */
135caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
136caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
137caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
138caccb7e3SMatthew G Knepley }
139caccb7e3SMatthew G Knepley 
140403adfb6SMatthew G Knepley __global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
141403adfb6SMatthew G Knepley {
142403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
143403adfb6SMatthew G Knepley   while (idx < len) {
144403adfb6SMatthew G Knepley     c[idx] = a[idx]+b[idx];
145403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
146403adfb6SMatthew G Knepley   }
147403adfb6SMatthew G Knepley }
148403adfb6SMatthew G Knepley 
149caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
150caccb7e3SMatthew G Knepley {
151caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
152caccb7e3SMatthew G Knepley   while (idx < len) {
153caccb7e3SMatthew G Knepley     c[idx] = a[idx]+b[idx];
154caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
155caccb7e3SMatthew G Knepley   }
156caccb7e3SMatthew G Knepley }
157caccb7e3SMatthew G Knepley 
158caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
159caccb7e3SMatthew G Knepley {
160caccb7e3SMatthew G Knepley   /*
161caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
162caccb7e3SMatthew G Knepley    * vector index space else return.
163caccb7e3SMatthew G Knepley    */
164caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
165caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
166caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
167caccb7e3SMatthew G Knepley }
168caccb7e3SMatthew G Knepley 
169caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
170caccb7e3SMatthew G Knepley {
171caccb7e3SMatthew G Knepley   /*
172caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
173caccb7e3SMatthew G Knepley    * vector index space else return.
174caccb7e3SMatthew G Knepley    */
175caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
176caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
177caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
178caccb7e3SMatthew G Knepley }
179caccb7e3SMatthew G Knepley 
180403adfb6SMatthew G Knepley __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
181403adfb6SMatthew G Knepley {
182403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
183403adfb6SMatthew G Knepley   while (idx < len) {
184403adfb6SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
185403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
186403adfb6SMatthew G Knepley   }
187403adfb6SMatthew G Knepley }
188403adfb6SMatthew G Knepley 
189caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
190caccb7e3SMatthew G Knepley {
191caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
192caccb7e3SMatthew G Knepley   while (idx < len) {
193caccb7e3SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
194caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
195caccb7e3SMatthew G Knepley   }
196caccb7e3SMatthew G Knepley }
197caccb7e3SMatthew G Knepley 
198caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
199caccb7e3SMatthew G Knepley {
200caccb7e3SMatthew G Knepley   /*
201caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
202caccb7e3SMatthew G Knepley    * vector index space else return.
203caccb7e3SMatthew G Knepley    */
204caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
205caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
206caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
207caccb7e3SMatthew G Knepley }
208caccb7e3SMatthew G Knepley 
209caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
210caccb7e3SMatthew G Knepley {
211caccb7e3SMatthew G Knepley   /*
212caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
213caccb7e3SMatthew G Knepley    * vector index space else return.
214caccb7e3SMatthew G Knepley    */
215caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
216caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
217caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
218caccb7e3SMatthew G Knepley }
219caccb7e3SMatthew G Knepley 
220403adfb6SMatthew G Knepley /* Host side verification routines */
221a6dfd86eSKarl Rupp bool STREAM_Copy_verify(float *a, float *b, size_t len)
222a6dfd86eSKarl Rupp {
223403adfb6SMatthew G Knepley   size_t idx;
224403adfb6SMatthew G Knepley   bool   bDifferent = false;
225403adfb6SMatthew G Knepley 
226403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
227403adfb6SMatthew G Knepley     float expectedResult     = a[idx];
228403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
229403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
230403adfb6SMatthew G Knepley     /* element-wise relative error determination */
231403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
232403adfb6SMatthew G Knepley   }
233403adfb6SMatthew G Knepley 
234403adfb6SMatthew G Knepley   return bDifferent;
235403adfb6SMatthew G Knepley }
236403adfb6SMatthew G Knepley 
237a6dfd86eSKarl Rupp bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
238a6dfd86eSKarl Rupp {
239caccb7e3SMatthew G Knepley   size_t idx;
240caccb7e3SMatthew G Knepley   bool   bDifferent = false;
241caccb7e3SMatthew G Knepley 
242caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
243caccb7e3SMatthew G Knepley     double expectedResult     = a[idx];
244caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
24519816777SMark     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/dbl_eps;
246caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
247caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
248caccb7e3SMatthew G Knepley   }
249caccb7e3SMatthew G Knepley 
250caccb7e3SMatthew G Knepley   return bDifferent;
251caccb7e3SMatthew G Knepley }
252caccb7e3SMatthew G Knepley 
253a6dfd86eSKarl Rupp bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
254a6dfd86eSKarl Rupp {
255403adfb6SMatthew G Knepley   size_t idx;
256403adfb6SMatthew G Knepley   bool   bDifferent = false;
257403adfb6SMatthew G Knepley 
258403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
259403adfb6SMatthew G Knepley     float expectedResult     = scale*a[idx];
260403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
261403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
262403adfb6SMatthew G Knepley     /* element-wise relative error determination */
263403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
264403adfb6SMatthew G Knepley   }
265403adfb6SMatthew G Knepley 
266403adfb6SMatthew G Knepley   return bDifferent;
267403adfb6SMatthew G Knepley }
268403adfb6SMatthew G Knepley 
269a6dfd86eSKarl Rupp bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
270a6dfd86eSKarl Rupp {
271caccb7e3SMatthew G Knepley   size_t idx;
272caccb7e3SMatthew G Knepley   bool   bDifferent = false;
273caccb7e3SMatthew G Knepley 
274caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
275caccb7e3SMatthew G Knepley     double expectedResult     = scale*a[idx];
276caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
277caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
278caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
279caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
280caccb7e3SMatthew G Knepley   }
281caccb7e3SMatthew G Knepley 
282caccb7e3SMatthew G Knepley   return bDifferent;
283caccb7e3SMatthew G Knepley }
284caccb7e3SMatthew G Knepley 
285a6dfd86eSKarl Rupp bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
286a6dfd86eSKarl Rupp {
287403adfb6SMatthew G Knepley   size_t idx;
288403adfb6SMatthew G Knepley   bool   bDifferent = false;
289403adfb6SMatthew G Knepley 
290403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
291403adfb6SMatthew G Knepley     float expectedResult     = a[idx] + b[idx];
292403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
293403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
294403adfb6SMatthew G Knepley     /* element-wise relative error determination */
295403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
296403adfb6SMatthew G Knepley   }
297403adfb6SMatthew G Knepley 
298403adfb6SMatthew G Knepley   return bDifferent;
299403adfb6SMatthew G Knepley }
300403adfb6SMatthew G Knepley 
301a6dfd86eSKarl Rupp bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
302a6dfd86eSKarl Rupp {
303caccb7e3SMatthew G Knepley   size_t idx;
304caccb7e3SMatthew G Knepley   bool   bDifferent = false;
305caccb7e3SMatthew G Knepley 
306caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
307caccb7e3SMatthew G Knepley     double expectedResult     = a[idx] + b[idx];
308caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
309caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
310caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
311caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
312caccb7e3SMatthew G Knepley   }
313caccb7e3SMatthew G Knepley 
314caccb7e3SMatthew G Knepley   return bDifferent;
315caccb7e3SMatthew G Knepley }
316caccb7e3SMatthew G Knepley 
317a6dfd86eSKarl Rupp bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
318a6dfd86eSKarl Rupp {
319403adfb6SMatthew G Knepley   size_t idx;
320403adfb6SMatthew G Knepley   bool   bDifferent = false;
321403adfb6SMatthew G Knepley 
322403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
323403adfb6SMatthew G Knepley     float expectedResult     = a[idx] + scalar*b[idx];
324403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
325403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
326403adfb6SMatthew G Knepley     /* element-wise relative error determination */
327403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 3.f);
328403adfb6SMatthew G Knepley   }
329403adfb6SMatthew G Knepley 
330403adfb6SMatthew G Knepley   return bDifferent;
331403adfb6SMatthew G Knepley }
332403adfb6SMatthew G Knepley 
333a6dfd86eSKarl Rupp bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
334a6dfd86eSKarl Rupp {
335caccb7e3SMatthew G Knepley   size_t idx;
336caccb7e3SMatthew G Knepley   bool   bDifferent = false;
337caccb7e3SMatthew G Knepley 
338caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
339caccb7e3SMatthew G Knepley     double expectedResult     = a[idx] + scalar*b[idx];
340caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
341caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
342caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
343caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 3.);
344caccb7e3SMatthew G Knepley   }
345caccb7e3SMatthew G Knepley 
346caccb7e3SMatthew G Knepley   return bDifferent;
347caccb7e3SMatthew G Knepley }
348caccb7e3SMatthew G Knepley 
349403adfb6SMatthew G Knepley /* forward declarations */
350caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
35319816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354403adfb6SMatthew G Knepley 
355403adfb6SMatthew G Knepley int main(int argc, char *argv[])
356403adfb6SMatthew G Knepley {
357403adfb6SMatthew G Knepley   PetscInt       device    = 0;
35819816777SMark   PetscBool      runDouble = PETSC_TRUE;
35919816777SMark   const PetscBool cpuTiming = PETSC_TRUE; // must be true
360403adfb6SMatthew G Knepley   PetscErrorCode ierr;
361403adfb6SMatthew G Knepley 
362*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));
36319816777SMark 
364a438ae71SBarry Smith   ierr = PetscInitialize(&argc, &argv, 0, help);if (ierr) return ierr;
365403adfb6SMatthew G Knepley 
366403adfb6SMatthew G Knepley   ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
367*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0));
368*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL));
369403adfb6SMatthew G Knepley   ierr = PetscOptionsEnd();
370403adfb6SMatthew G Knepley 
371caccb7e3SMatthew G Knepley   ierr = setupStream(device, runDouble, cpuTiming);
37219816777SMark   if (ierr) {
373*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
374403adfb6SMatthew G Knepley   }
37526f47effSBarry Smith   ierr = PetscFinalize();
37626f47effSBarry Smith   return ierr;
377403adfb6SMatthew G Knepley }
378403adfb6SMatthew G Knepley 
379403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
380403adfb6SMatthew G Knepley //Run the appropriate tests
381403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
382caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
383403adfb6SMatthew G Knepley {
384403adfb6SMatthew G Knepley   PetscInt       iNumThreadsPerBlock = 128;
385403adfb6SMatthew G Knepley   PetscErrorCode ierr;
386403adfb6SMatthew G Knepley 
387403adfb6SMatthew G Knepley   PetscFunctionBegin;
388403adfb6SMatthew G Knepley   // Check device
389403adfb6SMatthew G Knepley   {
390403adfb6SMatthew G Knepley     int deviceCount;
391403adfb6SMatthew G Knepley 
392403adfb6SMatthew G Knepley     cudaGetDeviceCount(&deviceCount);
393403adfb6SMatthew G Knepley     if (deviceCount == 0) {
394*5f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
395403adfb6SMatthew G Knepley       return -1000;
396403adfb6SMatthew G Knepley     }
397403adfb6SMatthew G Knepley 
398403adfb6SMatthew G Knepley     if (deviceNum >= deviceCount || deviceNum < 0) {
399*5f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0));
400403adfb6SMatthew G Knepley       deviceNum = 0;
401403adfb6SMatthew G Knepley     }
402403adfb6SMatthew G Knepley   }
403403adfb6SMatthew G Knepley 
404403adfb6SMatthew G Knepley   cudaSetDevice(deviceNum);
405*5f80ce2aSJacob Faibussowitsch   // CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
406403adfb6SMatthew G Knepley   cudaDeviceProp deviceProp;
40719816777SMark   if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
408*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n"));
409403adfb6SMatthew G Knepley     return -1;
410403adfb6SMatthew G Knepley   }
411403adfb6SMatthew G Knepley 
412caccb7e3SMatthew G Knepley   if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
413*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
414caccb7e3SMatthew G Knepley     return -1;
415caccb7e3SMatthew G Knepley   }
4166f2b61bcSKarl Rupp   if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
4176f2b61bcSKarl Rupp   else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
418403adfb6SMatthew G Knepley 
419caccb7e3SMatthew G Knepley   if (runDouble) {
420*5f80ce2aSJacob Faibussowitsch     CHKERRQ(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
42119816777SMark   } else {
422*5f80ce2aSJacob Faibussowitsch     CHKERRQ(runStream(iNumThreadsPerBlock, cpuTiming));
423caccb7e3SMatthew G Knepley   }
424403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
425403adfb6SMatthew G Knepley }
426403adfb6SMatthew G Knepley 
427403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
428403adfb6SMatthew G Knepley // runStream
429403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
430403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
431403adfb6SMatthew G Knepley {
432403adfb6SMatthew G Knepley   float          *d_a, *d_b, *d_c;
433403adfb6SMatthew G Knepley   int            k;
434caccb7e3SMatthew G Knepley   float          times[8][NTIMES];
435403adfb6SMatthew G Knepley   float          scalar;
436403adfb6SMatthew G Knepley   PetscErrorCode ierr;
437403adfb6SMatthew G Knepley 
438403adfb6SMatthew G Knepley   PetscFunctionBegin;
439403adfb6SMatthew G Knepley   /* Allocate memory on device */
440*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
441*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
442*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));
443403adfb6SMatthew G Knepley 
444403adfb6SMatthew G Knepley   /* Compute execution configuration */
445403adfb6SMatthew G Knepley 
446403adfb6SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
447403adfb6SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
448403adfb6SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
449403adfb6SMatthew G Knepley 
450403adfb6SMatthew G Knepley   /* Initialize memory on the device */
451403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
452403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
453403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
454403adfb6SMatthew G Knepley 
455403adfb6SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
456403adfb6SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
457403adfb6SMatthew G Knepley 
458403adfb6SMatthew G Knepley   scalar=3.0f;
459403adfb6SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
4608563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
461403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
46219816777SMark     cudaStreamSynchronize(NULL);
463*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
4648563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
46519816777SMark     if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec
466403adfb6SMatthew G Knepley 
467403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4688563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
469403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
47019816777SMark     cudaStreamSynchronize(NULL);
471*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
472df3898eeSBarry Smith     //get the total elapsed time in ms
4738563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
47419816777SMark     if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;
475403adfb6SMatthew G Knepley 
476403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4778563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
478403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
47919816777SMark     cudaStreamSynchronize(NULL);
480*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
481df3898eeSBarry Smith     //get the total elapsed time in ms
4828563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
48319816777SMark     if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;
484403adfb6SMatthew G Knepley 
485403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4868563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
487caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
48819816777SMark     cudaStreamSynchronize(NULL);
489*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
490df3898eeSBarry Smith     //get the total elapsed time in ms
4918563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
49219816777SMark     if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;
493403adfb6SMatthew G Knepley 
494403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4958563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
496*5f80ce2aSJacob Faibussowitsch     // CHKERRCUDA(cudaEventRecord(start, 0));
497caccb7e3SMatthew G Knepley     STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
49819816777SMark     cudaStreamSynchronize(NULL);
499*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
500*5f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaEventRecord(stop, 0));
501*5f80ce2aSJacob Faibussowitsch     // CHKERRCUDA(cudaEventSynchronize(stop));
502df3898eeSBarry Smith     //get the total elapsed time in ms
5038563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
50419816777SMark     if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
5056f2b61bcSKarl Rupp     else {
506*5f80ce2aSJacob Faibussowitsch       // CHKERRCUDA(cudaEventElapsedTime(&times[4][k], start, stop));
507403adfb6SMatthew G Knepley     }
508403adfb6SMatthew G Knepley 
509caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
5108563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
511caccb7e3SMatthew G Knepley     STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
51219816777SMark     cudaStreamSynchronize(NULL);
513*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
514df3898eeSBarry Smith     //get the total elapsed time in ms
5158563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
51619816777SMark     if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;
517caccb7e3SMatthew G Knepley 
518caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
5198563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
520caccb7e3SMatthew G Knepley     STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
52119816777SMark     cudaStreamSynchronize(NULL);
522*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
523df3898eeSBarry Smith     //get the total elapsed time in ms
5248563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
52519816777SMark     if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;
526caccb7e3SMatthew G Knepley 
527caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
5288563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
529caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
53019816777SMark     cudaStreamSynchronize(NULL);
531*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
532df3898eeSBarry Smith     //get the total elapsed time in ms
5338563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
53419816777SMark     if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
535caccb7e3SMatthew G Knepley   }
536caccb7e3SMatthew G Knepley 
53719816777SMark   if (1) { /* verify kernels */
538403adfb6SMatthew G Knepley   float *h_a, *h_b, *h_c;
539403adfb6SMatthew G Knepley   bool  errorSTREAMkernel = true;
540403adfb6SMatthew G Knepley 
541403adfb6SMatthew G Knepley   if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
542403adfb6SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
543403adfb6SMatthew G Knepley     exit(1);
544403adfb6SMatthew G Knepley   }
545403adfb6SMatthew G Knepley   if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
546403adfb6SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
547403adfb6SMatthew G Knepley     exit(1);
548403adfb6SMatthew G Knepley   }
549403adfb6SMatthew G Knepley 
550403adfb6SMatthew G Knepley   if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
551403adfb6SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
552403adfb6SMatthew G Knepley     exit(1);
553403adfb6SMatthew G Knepley   }
554403adfb6SMatthew G Knepley 
555403adfb6SMatthew G Knepley   /*
556403adfb6SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
557403adfb6SMatthew G Knepley    * device kernel output
558403adfb6SMatthew G Knepley    */
559403adfb6SMatthew G Knepley 
560403adfb6SMatthew G Knepley   /* Initialize memory on the device */
561403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
562403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
563403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
564403adfb6SMatthew G Knepley 
565403adfb6SMatthew G Knepley   STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
566*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
567*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
568403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
569403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
570*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
571403adfb6SMatthew G Knepley     exit(-2000);
572403adfb6SMatthew G Knepley   }
573403adfb6SMatthew G Knepley 
574403adfb6SMatthew G Knepley   /* Initialize memory on the device */
575403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
576403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
577403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
578403adfb6SMatthew G Knepley 
579403adfb6SMatthew G Knepley   STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
580*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
581*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
582403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
583403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
584*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
585403adfb6SMatthew G Knepley     exit(-3000);
586403adfb6SMatthew G Knepley   }
587403adfb6SMatthew G Knepley 
588403adfb6SMatthew G Knepley   /* Initialize memory on the device */
58919816777SMark   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
590403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
591403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
592403adfb6SMatthew G Knepley 
593403adfb6SMatthew G Knepley   STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
594*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
595*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
596403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
597403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
598*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
599403adfb6SMatthew G Knepley     exit(-4000);
600403adfb6SMatthew G Knepley   }
601403adfb6SMatthew G Knepley 
602403adfb6SMatthew G Knepley   /* Initialize memory on the device */
603403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
604403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
605403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
606403adfb6SMatthew G Knepley 
607403adfb6SMatthew G Knepley   STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
608*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
609*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
610*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
611403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
612403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
613*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
614403adfb6SMatthew G Knepley     exit(-5000);
615403adfb6SMatthew G Knepley   }
616403adfb6SMatthew G Knepley 
617403adfb6SMatthew G Knepley   /* Initialize memory on the device */
618403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
619403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
620403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
621403adfb6SMatthew G Knepley 
622403adfb6SMatthew G Knepley   STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
623*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
624*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
625*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
626403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
627403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
628*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
629403adfb6SMatthew G Knepley     exit(-6000);
630403adfb6SMatthew G Knepley   }
631403adfb6SMatthew G Knepley 
63219816777SMark   free(h_a);
63319816777SMark   free(h_b);
63419816777SMark   free(h_c);
63519816777SMark   }
636403adfb6SMatthew G Knepley   /* continue from here */
63719816777SMark   printResultsReadable(times, sizeof(float));
638403adfb6SMatthew G Knepley 
639403adfb6SMatthew G Knepley   /* Free memory on device */
640*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(d_a));
641*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(d_b));
642*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(d_c));
64319816777SMark 
644403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
645403adfb6SMatthew G Knepley }
646403adfb6SMatthew G Knepley 
647caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
648caccb7e3SMatthew G Knepley {
649caccb7e3SMatthew G Knepley   double         *d_a, *d_b, *d_c;
650caccb7e3SMatthew G Knepley   int            k;
651caccb7e3SMatthew G Knepley   float          times[8][NTIMES];
652caccb7e3SMatthew G Knepley   double         scalar;
653caccb7e3SMatthew G Knepley   PetscErrorCode ierr;
654caccb7e3SMatthew G Knepley 
655caccb7e3SMatthew G Knepley   PetscFunctionBegin;
656caccb7e3SMatthew G Knepley   /* Allocate memory on device */
657*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
658*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
659*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));
660caccb7e3SMatthew G Knepley 
661caccb7e3SMatthew G Knepley   /* Compute execution configuration */
662caccb7e3SMatthew G Knepley 
663caccb7e3SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
664caccb7e3SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
665caccb7e3SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
666caccb7e3SMatthew G Knepley 
667caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
668caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
669caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
670caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
671caccb7e3SMatthew G Knepley 
672caccb7e3SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
673caccb7e3SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
674caccb7e3SMatthew G Knepley 
675caccb7e3SMatthew G Knepley   scalar=3.0;
676caccb7e3SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
6778563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
678caccb7e3SMatthew G Knepley     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
67919816777SMark     cudaStreamSynchronize(NULL);
680*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
681df3898eeSBarry Smith     //get the total elapsed time in ms
682caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
6838563dfccSBarry Smith       PetscTimeAdd(&cpuTimer);
68419816777SMark       times[0][k] = cpuTimer*1.e3;
685caccb7e3SMatthew G Knepley     }
686caccb7e3SMatthew G Knepley 
687caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6888563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
689caccb7e3SMatthew G Knepley     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
69019816777SMark     cudaStreamSynchronize(NULL);
691*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
692df3898eeSBarry Smith     //get the total elapsed time in ms
693caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
6948563dfccSBarry Smith       PetscTimeAdd(&cpuTimer);
69519816777SMark       times[1][k] = cpuTimer*1.e3;
696caccb7e3SMatthew G Knepley     }
697caccb7e3SMatthew G Knepley 
698caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6998563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
700caccb7e3SMatthew G Knepley     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
70119816777SMark     cudaStreamSynchronize(NULL);
702*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
703df3898eeSBarry Smith     //get the total elapsed time in ms
7048563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
70519816777SMark     if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;
706caccb7e3SMatthew G Knepley 
707caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7088563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
709caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
71019816777SMark     cudaStreamSynchronize(NULL);
711*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
712df3898eeSBarry Smith     //get the total elapsed time in ms
7138563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
71419816777SMark     if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;
715caccb7e3SMatthew G Knepley 
716caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7178563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
718caccb7e3SMatthew G Knepley     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
71919816777SMark     cudaStreamSynchronize(NULL);
720*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
721df3898eeSBarry Smith     //get the total elapsed time in ms
7228563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
72319816777SMark     if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
724caccb7e3SMatthew G Knepley 
725caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7268563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
727caccb7e3SMatthew G Knepley     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
72819816777SMark     cudaStreamSynchronize(NULL);
729*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
730df3898eeSBarry Smith     //get the total elapsed time in ms
7318563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
73219816777SMark     if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;
733caccb7e3SMatthew G Knepley 
734caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7358563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
736caccb7e3SMatthew G Knepley     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
73719816777SMark     cudaStreamSynchronize(NULL);
738*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
739df3898eeSBarry Smith     //get the total elapsed time in ms
7408563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
74119816777SMark     if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;
742caccb7e3SMatthew G Knepley 
743caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7448563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
745caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
74619816777SMark     cudaStreamSynchronize(NULL);
747*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
748df3898eeSBarry Smith     //get the total elapsed time in ms
7498563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
75019816777SMark     if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
751caccb7e3SMatthew G Knepley   }
752caccb7e3SMatthew G Knepley 
75319816777SMark   if (1) { /* verify kernels */
754caccb7e3SMatthew G Knepley   double *h_a, *h_b, *h_c;
755caccb7e3SMatthew G Knepley   bool   errorSTREAMkernel = true;
756caccb7e3SMatthew G Knepley 
757caccb7e3SMatthew G Knepley   if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
758caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
759caccb7e3SMatthew G Knepley     exit(1);
760caccb7e3SMatthew G Knepley   }
761caccb7e3SMatthew G Knepley   if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
762caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
763caccb7e3SMatthew G Knepley     exit(1);
764caccb7e3SMatthew G Knepley   }
765caccb7e3SMatthew G Knepley 
766caccb7e3SMatthew G Knepley   if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
767caccb7e3SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
768caccb7e3SMatthew G Knepley     exit(1);
769caccb7e3SMatthew G Knepley   }
770caccb7e3SMatthew G Knepley 
771caccb7e3SMatthew G Knepley   /*
772caccb7e3SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
773caccb7e3SMatthew G Knepley    * device kernel output
774caccb7e3SMatthew G Knepley    */
775caccb7e3SMatthew G Knepley 
776caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
777caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
778caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
779caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
780caccb7e3SMatthew G Knepley 
781caccb7e3SMatthew G Knepley   STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
782*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
783*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
784caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
785caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
786*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
787caccb7e3SMatthew G Knepley     exit(-2000);
788caccb7e3SMatthew G Knepley   }
789caccb7e3SMatthew G Knepley 
790caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
791caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
792caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
793caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
794caccb7e3SMatthew G Knepley 
795caccb7e3SMatthew G Knepley   STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
796*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
797*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
798caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
799caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
800*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
801caccb7e3SMatthew G Knepley     exit(-3000);
802caccb7e3SMatthew G Knepley   }
803caccb7e3SMatthew G Knepley 
804caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
805caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
806caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
807caccb7e3SMatthew G Knepley 
808caccb7e3SMatthew G Knepley   STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
809*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
810*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
811caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
812caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
813*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
814caccb7e3SMatthew G Knepley     exit(-4000);
815caccb7e3SMatthew G Knepley   }
816caccb7e3SMatthew G Knepley 
817caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
818caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
819caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
820caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
821caccb7e3SMatthew G Knepley 
822caccb7e3SMatthew G Knepley   STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
823*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
824*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
825*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
826caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
827caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
828*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
829caccb7e3SMatthew G Knepley     exit(-5000);
830caccb7e3SMatthew G Knepley   }
831caccb7e3SMatthew G Knepley 
832caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
833caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
834caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
835caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
836caccb7e3SMatthew G Knepley 
837caccb7e3SMatthew G Knepley   STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
838*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
839*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
840*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
841caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
842caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
843*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
844caccb7e3SMatthew G Knepley     exit(-6000);
845caccb7e3SMatthew G Knepley   }
846caccb7e3SMatthew G Knepley 
84719816777SMark   free(h_a);
84819816777SMark   free(h_b);
84919816777SMark   free(h_c);
85019816777SMark   }
851caccb7e3SMatthew G Knepley   /* continue from here */
85219816777SMark   printResultsReadable(times,sizeof(double));
853caccb7e3SMatthew G Knepley 
854caccb7e3SMatthew G Knepley   /* Free memory on device */
855*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(d_a));
856*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(d_b));
857*5f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(d_c));
85819816777SMark 
859caccb7e3SMatthew G Knepley   PetscFunctionReturn(0);
860caccb7e3SMatthew G Knepley }
861caccb7e3SMatthew G Knepley 
862403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
863403adfb6SMatthew G Knepley //Print Results to Screen and File
864403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
86519816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
866a6dfd86eSKarl Rupp {
867403adfb6SMatthew G Knepley   PetscErrorCode ierr;
868403adfb6SMatthew G Knepley   PetscInt       j, k;
869caccb7e3SMatthew G Knepley   float          avgtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
870caccb7e3SMatthew G Knepley   float          maxtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
871caccb7e3SMatthew G Knepley   float          mintime[8]          = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
87219816777SMark   // char           *label[8]           = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
87319816777SMark   const float    bytes_per_kernel[8] = {
87419816777SMark     2. * bsize * N,
87519816777SMark     2. * bsize * N,
87619816777SMark     2. * bsize * N,
87719816777SMark     2. * bsize * N,
87819816777SMark     3. * bsize * N,
87919816777SMark     3. * bsize * N,
88019816777SMark     3. * bsize * N,
88119816777SMark     3. * bsize * N
882403adfb6SMatthew G Knepley   };
88319816777SMark   double         rate,irate;
88419816777SMark   int            rank,size;
885403adfb6SMatthew G Knepley   PetscFunctionBegin;
886*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
887*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
888403adfb6SMatthew G Knepley   /* --- SUMMARY --- */
88919816777SMark   for (k = 0; k < NTIMES; ++k) {
890caccb7e3SMatthew G Knepley     for (j = 0; j < 8; ++j) {
89119816777SMark       avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
892403adfb6SMatthew G Knepley       mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
893403adfb6SMatthew G Knepley       maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
894403adfb6SMatthew G Knepley     }
89519816777SMark   }
896caccb7e3SMatthew G Knepley   for (j = 0; j < 8; ++j) {
897403adfb6SMatthew G Knepley     avgtime[j] = avgtime[j]/(float)(NTIMES-1);
898403adfb6SMatthew G Knepley   }
89919816777SMark   j = 7;
90019816777SMark   irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
90119816777SMark   ierr = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
902dd400576SPatrick Sanan   if (rank == 0) {
90319816777SMark     FILE *fd;
90419816777SMark     if (size == 1) {
90519816777SMark       printf("%d %11.4f   Rate (MB/s)\n",size, rate);
90619816777SMark       fd = fopen("flops","w");
90719816777SMark       fprintf(fd,"%g\n",rate);
90819816777SMark       fclose(fd);
90919816777SMark     } else {
91019816777SMark       double prate;
91119816777SMark       fd = fopen("flops","r");
91219816777SMark       fscanf(fd,"%lg",&prate);
91319816777SMark       fclose(fd);
91419816777SMark       printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
91519816777SMark     }
91619816777SMark   }
91719816777SMark 
918403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
919403adfb6SMatthew G Knepley }
920