xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision caccb7e3fadda7e48a0c0f5df967c025f2f35a4c)
1403adfb6SMatthew G Knepley /*
2403adfb6SMatthew G Knepley   STREAM benchmark implementation in CUDA.
3403adfb6SMatthew G Knepley 
4403adfb6SMatthew G Knepley     COPY:       a(i) = b(i)
5403adfb6SMatthew G Knepley     SCALE:      a(i) = q*b(i)
6403adfb6SMatthew G Knepley     SUM:        a(i) = b(i) + c(i)
7403adfb6SMatthew G Knepley     TRIAD:      a(i) = b(i) + q*c(i)
8403adfb6SMatthew G Knepley 
9403adfb6SMatthew G Knepley   It measures the memory system on the device.
10403adfb6SMatthew G Knepley   The implementation is in single precision.
11403adfb6SMatthew G Knepley 
12403adfb6SMatthew G Knepley   Code based on the code developed by John D. McCalpin
13403adfb6SMatthew G Knepley   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14403adfb6SMatthew G Knepley 
15403adfb6SMatthew G Knepley   Written by: Massimiliano Fatica, NVIDIA Corporation
16403adfb6SMatthew G Knepley   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17403adfb6SMatthew G Knepley   Extensive Revisions, 4 December 2010
18403adfb6SMatthew G Knepley   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19403adfb6SMatthew G Knepley 
20403adfb6SMatthew G Knepley   User interface motivated by bandwidthTest NVIDIA SDK example.
21403adfb6SMatthew G Knepley */
22403adfb6SMatthew G Knepley static char *help =  "Single-Precision STREAM Benchmark implementation in CUDA\n"
23403adfb6SMatthew G Knepley                      "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24403adfb6SMatthew G Knepley 
25403adfb6SMatthew G Knepley #include <petscconf.h>
26403adfb6SMatthew G Knepley #include <petscsys.h>
27403adfb6SMatthew G Knepley #include <petsctime.h>
28403adfb6SMatthew G Knepley 
29403adfb6SMatthew G Knepley #define N        2000000
30*caccb7e3SMatthew G Knepley #define N_DOUBLE 8000000
31403adfb6SMatthew G Knepley #define NTIMES   10
32403adfb6SMatthew G Knepley 
33403adfb6SMatthew G Knepley # ifndef MIN
34403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y)?(x):(y))
35403adfb6SMatthew G Knepley # endif
36403adfb6SMatthew G Knepley # ifndef MAX
37403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y)?(x):(y))
38403adfb6SMatthew G Knepley # endif
39403adfb6SMatthew G Knepley 
40403adfb6SMatthew G Knepley const float flt_eps  = 1.192092896e-07f;
41*caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16;
42403adfb6SMatthew G Knepley 
43403adfb6SMatthew G Knepley __global__ void set_array(float *a,  float value, size_t len)
44403adfb6SMatthew G Knepley {
45403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
46403adfb6SMatthew G Knepley   while (idx < len) {
47403adfb6SMatthew G Knepley     a[idx] = value;
48403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
49403adfb6SMatthew G Knepley   }
50403adfb6SMatthew G Knepley }
51403adfb6SMatthew G Knepley 
52*caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a,  double value, size_t len)
53*caccb7e3SMatthew G Knepley {
54*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
55*caccb7e3SMatthew G Knepley   while (idx < len) {
56*caccb7e3SMatthew G Knepley     a[idx] = value;
57*caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
58*caccb7e3SMatthew G Knepley   }
59*caccb7e3SMatthew G Knepley }
60*caccb7e3SMatthew G Knepley 
61403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len)
62403adfb6SMatthew G Knepley {
63403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
64403adfb6SMatthew G Knepley   while (idx < len) {
65403adfb6SMatthew G Knepley     b[idx] = a[idx];
66403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
67403adfb6SMatthew G Knepley   }
68403adfb6SMatthew G Knepley }
69403adfb6SMatthew G Knepley 
70*caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len)
71*caccb7e3SMatthew G Knepley {
72*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
73*caccb7e3SMatthew G Knepley   while (idx < len) {
74*caccb7e3SMatthew G Knepley     b[idx] = a[idx];
75*caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
76*caccb7e3SMatthew G Knepley   }
77*caccb7e3SMatthew G Knepley }
78*caccb7e3SMatthew G Knepley 
79403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
80403adfb6SMatthew G Knepley {
81403adfb6SMatthew G Knepley   /*
82403adfb6SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
83403adfb6SMatthew G Knepley    * vector index space else return.
84403adfb6SMatthew G Knepley    */
85403adfb6SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
86403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
87403adfb6SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
88403adfb6SMatthew G Knepley }
89403adfb6SMatthew G Knepley 
90*caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
91*caccb7e3SMatthew G Knepley {
92*caccb7e3SMatthew G Knepley   /*
93*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
94*caccb7e3SMatthew G Knepley    * vector index space else return.
95*caccb7e3SMatthew G Knepley    */
96*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
97*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
98*caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
99*caccb7e3SMatthew G Knepley }
100*caccb7e3SMatthew G Knepley 
101403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
102403adfb6SMatthew G Knepley {
103403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
104403adfb6SMatthew G Knepley   while (idx < len) {
105403adfb6SMatthew G Knepley     b[idx] = scale* a[idx];
106403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
107403adfb6SMatthew G Knepley   }
108403adfb6SMatthew G Knepley }
109403adfb6SMatthew G Knepley 
110*caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
111*caccb7e3SMatthew G Knepley {
112*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
113*caccb7e3SMatthew G Knepley   while (idx < len) {
114*caccb7e3SMatthew G Knepley     b[idx] = scale* a[idx];
115*caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
116*caccb7e3SMatthew G Knepley   }
117*caccb7e3SMatthew G Knepley }
118*caccb7e3SMatthew G Knepley 
119*caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
120*caccb7e3SMatthew G Knepley {
121*caccb7e3SMatthew G Knepley   /*
122*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
123*caccb7e3SMatthew G Knepley    * vector index space else return.
124*caccb7e3SMatthew G Knepley    */
125*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
126*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
127*caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
128*caccb7e3SMatthew G Knepley }
129*caccb7e3SMatthew G Knepley 
130*caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
131*caccb7e3SMatthew G Knepley {
132*caccb7e3SMatthew G Knepley   /*
133*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
134*caccb7e3SMatthew G Knepley    * vector index space else return.
135*caccb7e3SMatthew G Knepley    */
136*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
137*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
138*caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
139*caccb7e3SMatthew G Knepley }
140*caccb7e3SMatthew G Knepley 
141403adfb6SMatthew G Knepley __global__ void STREAM_Add( float *a, float *b, float *c,  size_t len)
142403adfb6SMatthew G Knepley {
143403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
144403adfb6SMatthew G Knepley   while (idx < len) {
145403adfb6SMatthew G Knepley     c[idx] = a[idx]+b[idx];
146403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
147403adfb6SMatthew G Knepley   }
148403adfb6SMatthew G Knepley }
149403adfb6SMatthew G Knepley 
150*caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double( double *a, double *b, double *c,  size_t len)
151*caccb7e3SMatthew G Knepley {
152*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
153*caccb7e3SMatthew G Knepley   while (idx < len) {
154*caccb7e3SMatthew G Knepley     c[idx] = a[idx]+b[idx];
155*caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
156*caccb7e3SMatthew G Knepley   }
157*caccb7e3SMatthew G Knepley }
158*caccb7e3SMatthew G Knepley 
159*caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized( float *a, float *b, float *c,  size_t len)
160*caccb7e3SMatthew G Knepley {
161*caccb7e3SMatthew G Knepley   /*
162*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
163*caccb7e3SMatthew G Knepley    * vector index space else return.
164*caccb7e3SMatthew G Knepley    */
165*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
166*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
167*caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
168*caccb7e3SMatthew G Knepley }
169*caccb7e3SMatthew G Knepley 
170*caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double( double *a, double *b, double *c,  size_t len)
171*caccb7e3SMatthew G Knepley {
172*caccb7e3SMatthew G Knepley   /*
173*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
174*caccb7e3SMatthew G Knepley    * vector index space else return.
175*caccb7e3SMatthew G Knepley    */
176*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
177*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
178*caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
179*caccb7e3SMatthew G Knepley }
180*caccb7e3SMatthew G Knepley 
181403adfb6SMatthew G Knepley __global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len)
182403adfb6SMatthew G Knepley {
183403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
184403adfb6SMatthew G Knepley   while (idx < len) {
185403adfb6SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
186403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
187403adfb6SMatthew G Knepley   }
188403adfb6SMatthew G Knepley }
189403adfb6SMatthew G Knepley 
190*caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double( double *a, double *b, double *c, double scalar, size_t len)
191*caccb7e3SMatthew G Knepley {
192*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
193*caccb7e3SMatthew G Knepley   while (idx < len) {
194*caccb7e3SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
195*caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
196*caccb7e3SMatthew G Knepley   }
197*caccb7e3SMatthew G Knepley }
198*caccb7e3SMatthew G Knepley 
199*caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized( float *a, float *b, float *c, float scalar, size_t len)
200*caccb7e3SMatthew G Knepley {
201*caccb7e3SMatthew G Knepley   /*
202*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
203*caccb7e3SMatthew G Knepley    * vector index space else return.
204*caccb7e3SMatthew G Knepley    */
205*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
206*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
207*caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
208*caccb7e3SMatthew G Knepley }
209*caccb7e3SMatthew G Knepley 
210*caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double( double *a, double *b, double *c, double scalar, size_t len)
211*caccb7e3SMatthew G Knepley {
212*caccb7e3SMatthew G Knepley   /*
213*caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
214*caccb7e3SMatthew G Knepley    * vector index space else return.
215*caccb7e3SMatthew G Knepley    */
216*caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
217*caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
218*caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
219*caccb7e3SMatthew G Knepley }
220*caccb7e3SMatthew G Knepley 
221403adfb6SMatthew G Knepley /* Host side verification routines */
222403adfb6SMatthew G Knepley bool STREAM_Copy_verify(float *a, float *b, size_t len) {
223403adfb6SMatthew G Knepley   size_t idx;
224403adfb6SMatthew G Knepley   bool bDifferent = false;
225403adfb6SMatthew G Knepley 
226403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
227403adfb6SMatthew G Knepley     float expectedResult = a[idx];
228403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
229403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
230403adfb6SMatthew G Knepley     /* element-wise relative error determination */
231403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
232403adfb6SMatthew G Knepley   }
233403adfb6SMatthew G Knepley 
234403adfb6SMatthew G Knepley   return bDifferent;
235403adfb6SMatthew G Knepley }
236403adfb6SMatthew G Knepley 
237*caccb7e3SMatthew G Knepley bool STREAM_Copy_verify_double(double *a, double *b, size_t len) {
238*caccb7e3SMatthew G Knepley   size_t idx;
239*caccb7e3SMatthew G Knepley   bool bDifferent = false;
240*caccb7e3SMatthew G Knepley 
241*caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
242*caccb7e3SMatthew G Knepley     double expectedResult = a[idx];
243*caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
244*caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
245*caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
246*caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
247*caccb7e3SMatthew G Knepley   }
248*caccb7e3SMatthew G Knepley 
249*caccb7e3SMatthew G Knepley   return bDifferent;
250*caccb7e3SMatthew G Knepley }
251*caccb7e3SMatthew G Knepley 
252403adfb6SMatthew G Knepley bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
253403adfb6SMatthew G Knepley   size_t idx;
254403adfb6SMatthew G Knepley   bool bDifferent = false;
255403adfb6SMatthew G Knepley 
256403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
257403adfb6SMatthew G Knepley     float expectedResult = scale*a[idx];
258403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
259403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
260403adfb6SMatthew G Knepley     /* element-wise relative error determination */
261403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
262403adfb6SMatthew G Knepley   }
263403adfb6SMatthew G Knepley 
264403adfb6SMatthew G Knepley   return bDifferent;
265403adfb6SMatthew G Knepley }
266403adfb6SMatthew G Knepley 
267*caccb7e3SMatthew G Knepley bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) {
268*caccb7e3SMatthew G Knepley   size_t idx;
269*caccb7e3SMatthew G Knepley   bool bDifferent = false;
270*caccb7e3SMatthew G Knepley 
271*caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
272*caccb7e3SMatthew G Knepley     double expectedResult = scale*a[idx];
273*caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
274*caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
275*caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
276*caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
277*caccb7e3SMatthew G Knepley   }
278*caccb7e3SMatthew G Knepley 
279*caccb7e3SMatthew G Knepley   return bDifferent;
280*caccb7e3SMatthew G Knepley }
281*caccb7e3SMatthew G Knepley 
282403adfb6SMatthew G Knepley bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
283403adfb6SMatthew G Knepley   size_t idx;
284403adfb6SMatthew G Knepley   bool bDifferent = false;
285403adfb6SMatthew G Knepley 
286403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
287403adfb6SMatthew G Knepley     float expectedResult = a[idx] + b[idx];
288403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
289403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
290403adfb6SMatthew G Knepley     /* element-wise relative error determination */
291403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
292403adfb6SMatthew G Knepley   }
293403adfb6SMatthew G Knepley 
294403adfb6SMatthew G Knepley   return bDifferent;
295403adfb6SMatthew G Knepley }
296403adfb6SMatthew G Knepley 
297*caccb7e3SMatthew G Knepley bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) {
298*caccb7e3SMatthew G Knepley   size_t idx;
299*caccb7e3SMatthew G Knepley   bool bDifferent = false;
300*caccb7e3SMatthew G Knepley 
301*caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
302*caccb7e3SMatthew G Knepley     double expectedResult = a[idx] + b[idx];
303*caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
304*caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
305*caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
306*caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
307*caccb7e3SMatthew G Knepley   }
308*caccb7e3SMatthew G Knepley 
309*caccb7e3SMatthew G Knepley   return bDifferent;
310*caccb7e3SMatthew G Knepley }
311*caccb7e3SMatthew G Knepley 
312403adfb6SMatthew G Knepley bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
313403adfb6SMatthew G Knepley   size_t idx;
314403adfb6SMatthew G Knepley   bool bDifferent = false;
315403adfb6SMatthew G Knepley 
316403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
317403adfb6SMatthew G Knepley     float expectedResult = a[idx] + scalar*b[idx];
318403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
319403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
320403adfb6SMatthew G Knepley     /* element-wise relative error determination */
321403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 3.f);
322403adfb6SMatthew G Knepley   }
323403adfb6SMatthew G Knepley 
324403adfb6SMatthew G Knepley   return bDifferent;
325403adfb6SMatthew G Knepley }
326403adfb6SMatthew G Knepley 
327*caccb7e3SMatthew G Knepley bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) {
328*caccb7e3SMatthew G Knepley   size_t idx;
329*caccb7e3SMatthew G Knepley   bool bDifferent = false;
330*caccb7e3SMatthew G Knepley 
331*caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
332*caccb7e3SMatthew G Knepley     double expectedResult = a[idx] + scalar*b[idx];
333*caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
334*caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
335*caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
336*caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 3.);
337*caccb7e3SMatthew G Knepley   }
338*caccb7e3SMatthew G Knepley 
339*caccb7e3SMatthew G Knepley   return bDifferent;
340*caccb7e3SMatthew G Knepley }
341*caccb7e3SMatthew G Knepley 
342403adfb6SMatthew G Knepley /* forward declarations */
343*caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
344403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
345*caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
346403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]);
347403adfb6SMatthew G Knepley 
348403adfb6SMatthew G Knepley int main(int argc, char *argv[])
349403adfb6SMatthew G Knepley {
350403adfb6SMatthew G Knepley   PetscInt       device    = 0;
351*caccb7e3SMatthew G Knepley   PetscBool      runDouble = PETSC_FALSE;
352403adfb6SMatthew G Knepley   PetscBool      cpuTiming = PETSC_FALSE;
353403adfb6SMatthew G Knepley   PetscErrorCode ierr;
354403adfb6SMatthew G Knepley 
355403adfb6SMatthew G Knepley   ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
356*caccb7e3SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
357403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);
358403adfb6SMatthew G Knepley 
359403adfb6SMatthew G Knepley   ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
360403adfb6SMatthew G Knepley     ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
361*caccb7e3SMatthew G Knepley     ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr);
362403adfb6SMatthew G Knepley     ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
363403adfb6SMatthew G Knepley   ierr = PetscOptionsEnd();
364403adfb6SMatthew G Knepley 
365*caccb7e3SMatthew G Knepley   ierr = setupStream(device, runDouble, cpuTiming);
366403adfb6SMatthew G Knepley   if (ierr >= 0) {
367403adfb6SMatthew G Knepley     PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
368403adfb6SMatthew G Knepley   }
369403adfb6SMatthew G Knepley   PetscFinalize();
370403adfb6SMatthew G Knepley   return 0;
371403adfb6SMatthew G Knepley }
372403adfb6SMatthew G Knepley 
373403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
374403adfb6SMatthew G Knepley //Run the appropriate tests
375403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
376*caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
377403adfb6SMatthew G Knepley {
378403adfb6SMatthew G Knepley   PetscInt       iNumThreadsPerBlock = 128;
379403adfb6SMatthew G Knepley   PetscErrorCode ierr;
380403adfb6SMatthew G Knepley 
381403adfb6SMatthew G Knepley   PetscFunctionBegin;
382403adfb6SMatthew G Knepley   // Check device
383403adfb6SMatthew G Knepley   {
384403adfb6SMatthew G Knepley     int deviceCount;
385403adfb6SMatthew G Knepley 
386403adfb6SMatthew G Knepley     cudaGetDeviceCount(&deviceCount);
387403adfb6SMatthew G Knepley     if (deviceCount == 0) {
388403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
389403adfb6SMatthew G Knepley       return -1000;
390403adfb6SMatthew G Knepley     }
391403adfb6SMatthew G Knepley 
392403adfb6SMatthew G Knepley     if (deviceNum >= deviceCount || deviceNum < 0) {
393403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
394403adfb6SMatthew G Knepley       deviceNum = 0;
395403adfb6SMatthew G Knepley     }
396403adfb6SMatthew G Knepley   }
397403adfb6SMatthew G Knepley 
398403adfb6SMatthew G Knepley   cudaSetDevice(deviceNum);
399403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
400403adfb6SMatthew G Knepley   cudaDeviceProp deviceProp;
401403adfb6SMatthew G Knepley   if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
402403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr);
403403adfb6SMatthew G Knepley   } else {
404403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr);
405403adfb6SMatthew G Knepley     return -1;
406403adfb6SMatthew G Knepley   }
407403adfb6SMatthew G Knepley 
408*caccb7e3SMatthew G Knepley   if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
409*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
410*caccb7e3SMatthew G Knepley     return -1;
411*caccb7e3SMatthew G Knepley   }
412403adfb6SMatthew G Knepley   if (deviceProp.major == 2 && deviceProp.minor == 1) {
413403adfb6SMatthew G Knepley     iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
414403adfb6SMatthew G Knepley   } else {
415403adfb6SMatthew G Knepley     iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
416403adfb6SMatthew G Knepley   }
417403adfb6SMatthew G Knepley 
418403adfb6SMatthew G Knepley   if (cpuTiming) {
419403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
420403adfb6SMatthew G Knepley   }
421403adfb6SMatthew G Knepley 
422403adfb6SMatthew G Knepley   ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
423*caccb7e3SMatthew G Knepley   if (runDouble) {
424*caccb7e3SMatthew G Knepley     ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
425*caccb7e3SMatthew G Knepley     ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
426*caccb7e3SMatthew G Knepley   }
427403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
428403adfb6SMatthew G Knepley }
429403adfb6SMatthew G Knepley 
430403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
431403adfb6SMatthew G Knepley // runStream
432403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
433403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
434403adfb6SMatthew G Knepley {
435403adfb6SMatthew G Knepley   float *d_a, *d_b, *d_c;
436403adfb6SMatthew G Knepley   int k;
437*caccb7e3SMatthew G Knepley   float times[8][NTIMES];
438403adfb6SMatthew G Knepley   float scalar;
439403adfb6SMatthew G Knepley   PetscErrorCode ierr;
440403adfb6SMatthew G Knepley 
441403adfb6SMatthew G Knepley   PetscFunctionBegin;
442403adfb6SMatthew G Knepley   /* Allocate memory on device */
443403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
444403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
445403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);
446403adfb6SMatthew G Knepley 
447403adfb6SMatthew G Knepley   /* Compute execution configuration */
448403adfb6SMatthew G Knepley 
449403adfb6SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
450403adfb6SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
451403adfb6SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
452403adfb6SMatthew G Knepley 
453403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
454403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
455403adfb6SMatthew G Knepley 
456403adfb6SMatthew G Knepley   /* Initialize memory on the device */
457403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
458403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
459403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
460403adfb6SMatthew G Knepley 
461403adfb6SMatthew G Knepley   /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
462403adfb6SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
463403adfb6SMatthew G Knepley   cudaEvent_t start, stop;
464403adfb6SMatthew G Knepley 
465403adfb6SMatthew G Knepley   /* both timers report msec */
466403adfb6SMatthew G Knepley   ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
467403adfb6SMatthew G Knepley   ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */
468403adfb6SMatthew G Knepley 
469403adfb6SMatthew G Knepley   scalar=3.0f;
470403adfb6SMatthew G Knepley   for(k = 0; k < NTIMES; ++k) {
471403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
472403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
473403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
474403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
475403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
476403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
477403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
478*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
479403adfb6SMatthew G Knepley       times[0][k] = cpuTimer;
480403adfb6SMatthew G Knepley     } else {
481403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
482403adfb6SMatthew G Knepley     }
483403adfb6SMatthew G Knepley 
484403adfb6SMatthew G Knepley     cpuTimer = 0.0;
485403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
486403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
487403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
488403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
489403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
490403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
491403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
492*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
493403adfb6SMatthew G Knepley       times[1][k] = cpuTimer;
494403adfb6SMatthew G Knepley     } else {
495403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
496403adfb6SMatthew G Knepley     }
497403adfb6SMatthew G Knepley 
498403adfb6SMatthew G Knepley     cpuTimer = 0.0;
499403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
500403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
501403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
502403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
503403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
504403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
505403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
506403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
507403adfb6SMatthew G Knepley       times[2][k] = cpuTimer;
508403adfb6SMatthew G Knepley     } else {
509403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
510403adfb6SMatthew G Knepley     }
511403adfb6SMatthew G Knepley 
512403adfb6SMatthew G Knepley     cpuTimer = 0.0;
513403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
514403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
515*caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
516403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
517403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
518403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
519403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
520403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
521403adfb6SMatthew G Knepley       times[3][k] = cpuTimer;
522403adfb6SMatthew G Knepley     } else {
523403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
524403adfb6SMatthew G Knepley     }
525403adfb6SMatthew G Knepley 
526403adfb6SMatthew G Knepley     cpuTimer = 0.0;
527403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
528403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
529*caccb7e3SMatthew G Knepley     STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
530403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
531403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
532403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
533403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
534403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
535403adfb6SMatthew G Knepley       times[4][k] = cpuTimer;
536403adfb6SMatthew G Knepley     } else {
537403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
538403adfb6SMatthew G Knepley     }
539403adfb6SMatthew G Knepley 
540*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
541*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
542*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
543*caccb7e3SMatthew G Knepley     STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
544*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
545*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
546*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
547*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
548*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
549*caccb7e3SMatthew G Knepley       times[5][k] = cpuTimer;
550*caccb7e3SMatthew G Knepley     } else {
551*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[5][k], start, stop );CHKERRQ(ierr);
552*caccb7e3SMatthew G Knepley     }
553*caccb7e3SMatthew G Knepley 
554*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
555*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
556*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
557*caccb7e3SMatthew G Knepley     STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
558*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
559*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
560*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
561*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
562*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
563*caccb7e3SMatthew G Knepley       times[6][k] = cpuTimer;
564*caccb7e3SMatthew G Knepley     } else {
565*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[6][k], start, stop );CHKERRQ(ierr);
566*caccb7e3SMatthew G Knepley     }
567*caccb7e3SMatthew G Knepley 
568*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
569*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
570*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
571*caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
572*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
573*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
574*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
575*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
576*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
577*caccb7e3SMatthew G Knepley       times[7][k] = cpuTimer;
578*caccb7e3SMatthew G Knepley     } else {
579*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[7][k], start, stop );CHKERRQ(ierr);
580*caccb7e3SMatthew G Knepley     }
581*caccb7e3SMatthew G Knepley 
582403adfb6SMatthew G Knepley   }
583403adfb6SMatthew G Knepley 
584403adfb6SMatthew G Knepley   /* verify kernels */
585403adfb6SMatthew G Knepley   float *h_a, *h_b, *h_c;
586403adfb6SMatthew G Knepley   bool errorSTREAMkernel = true;
587403adfb6SMatthew G Knepley 
588403adfb6SMatthew G Knepley   if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
589403adfb6SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
590403adfb6SMatthew G Knepley     exit(1);
591403adfb6SMatthew G Knepley   }
592403adfb6SMatthew G Knepley   if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
593403adfb6SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
594403adfb6SMatthew G Knepley     exit(1);
595403adfb6SMatthew G Knepley   }
596403adfb6SMatthew G Knepley 
597403adfb6SMatthew G Knepley   if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
598403adfb6SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
599403adfb6SMatthew G Knepley     exit(1);
600403adfb6SMatthew G Knepley   }
601403adfb6SMatthew G Knepley 
602403adfb6SMatthew G Knepley   /*
603403adfb6SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
604403adfb6SMatthew G Knepley    * device kernel output
605403adfb6SMatthew G Knepley    */
606403adfb6SMatthew G Knepley 
607403adfb6SMatthew G Knepley   /* Initialize memory on the device */
608403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
609403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
610403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
611403adfb6SMatthew G Knepley 
612403adfb6SMatthew G Knepley   STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
613403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
614403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
615403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
616403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
617403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
618403adfb6SMatthew G Knepley     exit(-2000);
619403adfb6SMatthew G Knepley   } else {
620403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
621403adfb6SMatthew G Knepley   }
622403adfb6SMatthew G Knepley 
623403adfb6SMatthew G Knepley   /* Initialize memory on the device */
624403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
625403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
626403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
627403adfb6SMatthew G Knepley 
628403adfb6SMatthew G Knepley   STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
629403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
630403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
631403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
632403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
633403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
634403adfb6SMatthew G Knepley     exit(-3000);
635403adfb6SMatthew G Knepley   } else {
636403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
637403adfb6SMatthew G Knepley   }
638403adfb6SMatthew G Knepley 
639403adfb6SMatthew G Knepley   /* Initialize memory on the device */
640403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
641403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
642403adfb6SMatthew G Knepley 
643403adfb6SMatthew G Knepley   STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
644403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
645403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
646403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
647403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
648403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
649403adfb6SMatthew G Knepley     exit(-4000);
650403adfb6SMatthew G Knepley   } else {
651403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
652403adfb6SMatthew G Knepley   }
653403adfb6SMatthew G Knepley 
654403adfb6SMatthew G Knepley   /* Initialize memory on the device */
655403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
656403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
657403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
658403adfb6SMatthew G Knepley 
659403adfb6SMatthew G Knepley   STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
660403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
661403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
662403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
663403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
664403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
665403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
666403adfb6SMatthew G Knepley     exit(-5000);
667403adfb6SMatthew G Knepley   } else {
668403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
669403adfb6SMatthew G Knepley   }
670403adfb6SMatthew G Knepley 
671403adfb6SMatthew G Knepley   /* Initialize memory on the device */
672403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
673403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
674403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
675403adfb6SMatthew G Knepley 
676403adfb6SMatthew G Knepley   STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
677403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
678403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
679403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
680403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
681403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
682403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
683403adfb6SMatthew G Knepley     exit(-6000);
684403adfb6SMatthew G Knepley   } else {
685403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
686403adfb6SMatthew G Knepley   }
687403adfb6SMatthew G Knepley 
688403adfb6SMatthew G Knepley   /* continue from here */
689403adfb6SMatthew G Knepley   printResultsReadable(times);
690403adfb6SMatthew G Knepley 
691403adfb6SMatthew G Knepley   //clean up timers
692403adfb6SMatthew G Knepley   ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
693403adfb6SMatthew G Knepley   ierr = cudaEventDestroy( start );CHKERRQ(ierr);
694403adfb6SMatthew G Knepley 
695403adfb6SMatthew G Knepley   /* Free memory on device */
696403adfb6SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
697403adfb6SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
698403adfb6SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
699403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
700403adfb6SMatthew G Knepley }
701403adfb6SMatthew G Knepley 
702*caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
703*caccb7e3SMatthew G Knepley {
704*caccb7e3SMatthew G Knepley   double *d_a, *d_b, *d_c;
705*caccb7e3SMatthew G Knepley   int k;
706*caccb7e3SMatthew G Knepley   float times[8][NTIMES];
707*caccb7e3SMatthew G Knepley   double scalar;
708*caccb7e3SMatthew G Knepley   PetscErrorCode ierr;
709*caccb7e3SMatthew G Knepley 
710*caccb7e3SMatthew G Knepley   PetscFunctionBegin;
711*caccb7e3SMatthew G Knepley   /* Allocate memory on device */
712*caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
713*caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
714*caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);
715*caccb7e3SMatthew G Knepley 
716*caccb7e3SMatthew G Knepley   /* Compute execution configuration */
717*caccb7e3SMatthew G Knepley 
718*caccb7e3SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
719*caccb7e3SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
720*caccb7e3SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
721*caccb7e3SMatthew G Knepley 
722*caccb7e3SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
723*caccb7e3SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
724*caccb7e3SMatthew G Knepley 
725*caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
726*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
727*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
728*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
729*caccb7e3SMatthew G Knepley 
730*caccb7e3SMatthew G Knepley   /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
731*caccb7e3SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
732*caccb7e3SMatthew G Knepley   cudaEvent_t start, stop;
733*caccb7e3SMatthew G Knepley 
734*caccb7e3SMatthew G Knepley   /* both timers report msec */
735*caccb7e3SMatthew G Knepley   ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
736*caccb7e3SMatthew G Knepley   ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */
737*caccb7e3SMatthew G Knepley 
738*caccb7e3SMatthew G Knepley   scalar=3.0;
739*caccb7e3SMatthew G Knepley   for(k = 0; k < NTIMES; ++k) {
740*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
741*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
742*caccb7e3SMatthew G Knepley     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
743*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
744*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
745*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
746*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
747*caccb7e3SMatthew G Knepley       PetscTimeAdd(cpuTimer);
748*caccb7e3SMatthew G Knepley       times[0][k] = cpuTimer;
749*caccb7e3SMatthew G Knepley     } else {
750*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
751*caccb7e3SMatthew G Knepley     }
752*caccb7e3SMatthew G Knepley 
753*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
754*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
755*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
756*caccb7e3SMatthew G Knepley     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
757*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
758*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
759*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
760*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
761*caccb7e3SMatthew G Knepley       PetscTimeAdd(cpuTimer);
762*caccb7e3SMatthew G Knepley       times[1][k] = cpuTimer;
763*caccb7e3SMatthew G Knepley     } else {
764*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
765*caccb7e3SMatthew G Knepley     }
766*caccb7e3SMatthew G Knepley 
767*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
768*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
769*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
770*caccb7e3SMatthew G Knepley     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
771*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
772*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
773*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
774*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
775*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
776*caccb7e3SMatthew G Knepley       times[2][k] = cpuTimer;
777*caccb7e3SMatthew G Knepley     } else {
778*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
779*caccb7e3SMatthew G Knepley     }
780*caccb7e3SMatthew G Knepley 
781*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
782*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
783*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
784*caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
785*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
786*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
787*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
788*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
789*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
790*caccb7e3SMatthew G Knepley       times[3][k] = cpuTimer;
791*caccb7e3SMatthew G Knepley     } else {
792*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
793*caccb7e3SMatthew G Knepley     }
794*caccb7e3SMatthew G Knepley 
795*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
796*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
797*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
798*caccb7e3SMatthew G Knepley     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
799*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
800*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
801*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
802*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
803*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
804*caccb7e3SMatthew G Knepley       times[4][k] = cpuTimer;
805*caccb7e3SMatthew G Knepley     } else {
806*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
807*caccb7e3SMatthew G Knepley     }
808*caccb7e3SMatthew G Knepley 
809*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
810*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
811*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
812*caccb7e3SMatthew G Knepley     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
813*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
814*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
815*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
816*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
817*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
818*caccb7e3SMatthew G Knepley       times[5][k] = cpuTimer;
819*caccb7e3SMatthew G Knepley     } else {
820*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
821*caccb7e3SMatthew G Knepley     }
822*caccb7e3SMatthew G Knepley 
823*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
824*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
825*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
826*caccb7e3SMatthew G Knepley     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
827*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
828*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
829*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
830*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
831*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
832*caccb7e3SMatthew G Knepley       times[6][k] = cpuTimer;
833*caccb7e3SMatthew G Knepley     } else {
834*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
835*caccb7e3SMatthew G Knepley     }
836*caccb7e3SMatthew G Knepley 
837*caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
838*caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
839*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
840*caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
841*caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
842*caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
843*caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
844*caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
845*caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
846*caccb7e3SMatthew G Knepley       times[7][k] = cpuTimer;
847*caccb7e3SMatthew G Knepley     } else {
848*caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
849*caccb7e3SMatthew G Knepley     }
850*caccb7e3SMatthew G Knepley 
851*caccb7e3SMatthew G Knepley   }
852*caccb7e3SMatthew G Knepley 
853*caccb7e3SMatthew G Knepley   /* verify kernels */
854*caccb7e3SMatthew G Knepley   double *h_a, *h_b, *h_c;
855*caccb7e3SMatthew G Knepley   bool errorSTREAMkernel = true;
856*caccb7e3SMatthew G Knepley 
857*caccb7e3SMatthew G Knepley   if ( (h_a = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
858*caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
859*caccb7e3SMatthew G Knepley     exit(1);
860*caccb7e3SMatthew G Knepley   }
861*caccb7e3SMatthew G Knepley   if ( (h_b = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
862*caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
863*caccb7e3SMatthew G Knepley     exit(1);
864*caccb7e3SMatthew G Knepley   }
865*caccb7e3SMatthew G Knepley 
866*caccb7e3SMatthew G Knepley   if ( (h_c = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
867*caccb7e3SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
868*caccb7e3SMatthew G Knepley     exit(1);
869*caccb7e3SMatthew G Knepley   }
870*caccb7e3SMatthew G Knepley 
871*caccb7e3SMatthew G Knepley   /*
872*caccb7e3SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
873*caccb7e3SMatthew G Knepley    * device kernel output
874*caccb7e3SMatthew G Knepley    */
875*caccb7e3SMatthew G Knepley 
876*caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
877*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
878*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
879*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
880*caccb7e3SMatthew G Knepley 
881*caccb7e3SMatthew G Knepley   STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
882*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
883*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
884*caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
885*caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
886*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
887*caccb7e3SMatthew G Knepley     exit(-2000);
888*caccb7e3SMatthew G Knepley   } else {
889*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
890*caccb7e3SMatthew G Knepley   }
891*caccb7e3SMatthew G Knepley 
892*caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
893*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
894*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
895*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
896*caccb7e3SMatthew G Knepley 
897*caccb7e3SMatthew G Knepley   STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
898*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
899*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
900*caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
901*caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
902*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
903*caccb7e3SMatthew G Knepley     exit(-3000);
904*caccb7e3SMatthew G Knepley   } else {
905*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
906*caccb7e3SMatthew G Knepley   }
907*caccb7e3SMatthew G Knepley 
908*caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
909*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
910*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
911*caccb7e3SMatthew G Knepley 
912*caccb7e3SMatthew G Knepley   STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
913*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
914*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
915*caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
916*caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
917*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
918*caccb7e3SMatthew G Knepley     exit(-4000);
919*caccb7e3SMatthew G Knepley   } else {
920*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
921*caccb7e3SMatthew G Knepley   }
922*caccb7e3SMatthew G Knepley 
923*caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
924*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
925*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
926*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
927*caccb7e3SMatthew G Knepley 
928*caccb7e3SMatthew G Knepley   STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
929*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
930*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
931*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
932*caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
933*caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
934*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
935*caccb7e3SMatthew G Knepley     exit(-5000);
936*caccb7e3SMatthew G Knepley   } else {
937*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
938*caccb7e3SMatthew G Knepley   }
939*caccb7e3SMatthew G Knepley 
940*caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
941*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
942*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
943*caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
944*caccb7e3SMatthew G Knepley 
945*caccb7e3SMatthew G Knepley   STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
946*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
947*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
948*caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
949*caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
950*caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
951*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
952*caccb7e3SMatthew G Knepley     exit(-6000);
953*caccb7e3SMatthew G Knepley   } else {
954*caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
955*caccb7e3SMatthew G Knepley   }
956*caccb7e3SMatthew G Knepley 
957*caccb7e3SMatthew G Knepley   /* continue from here */
958*caccb7e3SMatthew G Knepley   printResultsReadable(times);
959*caccb7e3SMatthew G Knepley 
960*caccb7e3SMatthew G Knepley   //clean up timers
961*caccb7e3SMatthew G Knepley   ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
962*caccb7e3SMatthew G Knepley   ierr = cudaEventDestroy( start );CHKERRQ(ierr);
963*caccb7e3SMatthew G Knepley 
964*caccb7e3SMatthew G Knepley   /* Free memory on device */
965*caccb7e3SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
966*caccb7e3SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
967*caccb7e3SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
968*caccb7e3SMatthew G Knepley   PetscFunctionReturn(0);
969*caccb7e3SMatthew G Knepley }
970*caccb7e3SMatthew G Knepley 
971403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
972403adfb6SMatthew G Knepley //Print Results to Screen and File
973403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
974403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]) {
975403adfb6SMatthew G Knepley   PetscErrorCode ierr;
976403adfb6SMatthew G Knepley   PetscInt       j, k;
977*caccb7e3SMatthew G Knepley   float	avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
978*caccb7e3SMatthew G Knepley   float	maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
979*caccb7e3SMatthew G Knepley   float	mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
980*caccb7e3SMatthew G Knepley   char *label[8]   = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
981*caccb7e3SMatthew G Knepley   float	bytes_per_kernel[8] = {
982403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
983403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
984403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
985*caccb7e3SMatthew G Knepley     2. * sizeof(float) * N,
986*caccb7e3SMatthew G Knepley     3. * sizeof(float) * N,
987*caccb7e3SMatthew G Knepley     3. * sizeof(float) * N,
988403adfb6SMatthew G Knepley     3. * sizeof(float) * N,
989403adfb6SMatthew G Knepley     3. * sizeof(float) * N
990403adfb6SMatthew G Knepley   };
991403adfb6SMatthew G Knepley 
992403adfb6SMatthew G Knepley   PetscFunctionBegin;
993403adfb6SMatthew G Knepley   /* --- SUMMARY --- */
994403adfb6SMatthew G Knepley   for(k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */
995*caccb7e3SMatthew G Knepley     for(j = 0; j < 8; ++j) {
996403adfb6SMatthew G Knepley       avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
997403adfb6SMatthew G Knepley       mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
998403adfb6SMatthew G Knepley       maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
999403adfb6SMatthew G Knepley     }
1000403adfb6SMatthew G Knepley   }
1001403adfb6SMatthew G Knepley 
1002403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);
1003403adfb6SMatthew G Knepley 
1004*caccb7e3SMatthew G Knepley   for(j = 0; j < 8; ++j) {
1005403adfb6SMatthew G Knepley      avgtime[j] = avgtime[j]/(float)(NTIMES-1);
1006403adfb6SMatthew G Knepley      ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
1007403adfb6SMatthew G Knepley   }
1008403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
1009403adfb6SMatthew G Knepley }
1010