xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision a6dfd86ebdf75fa024070af26d51b62fd16650a3)
1403adfb6SMatthew G Knepley /*
2403adfb6SMatthew G Knepley   STREAM benchmark implementation in CUDA.
3403adfb6SMatthew G Knepley 
4403adfb6SMatthew G Knepley     COPY:       a(i) = b(i)
5403adfb6SMatthew G Knepley     SCALE:      a(i) = q*b(i)
6403adfb6SMatthew G Knepley     SUM:        a(i) = b(i) + c(i)
7403adfb6SMatthew G Knepley     TRIAD:      a(i) = b(i) + q*c(i)
8403adfb6SMatthew G Knepley 
9403adfb6SMatthew G Knepley   It measures the memory system on the device.
10403adfb6SMatthew G Knepley   The implementation is in single precision.
11403adfb6SMatthew G Knepley 
12403adfb6SMatthew G Knepley   Code based on the code developed by John D. McCalpin
13403adfb6SMatthew G Knepley   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14403adfb6SMatthew G Knepley 
15403adfb6SMatthew G Knepley   Written by: Massimiliano Fatica, NVIDIA Corporation
16403adfb6SMatthew G Knepley   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17403adfb6SMatthew G Knepley   Extensive Revisions, 4 December 2010
18403adfb6SMatthew G Knepley   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19403adfb6SMatthew G Knepley 
20403adfb6SMatthew G Knepley   User interface motivated by bandwidthTest NVIDIA SDK example.
21403adfb6SMatthew G Knepley */
22403adfb6SMatthew G Knepley static char *help =  "Single-Precision STREAM Benchmark implementation in CUDA\n"
23403adfb6SMatthew G Knepley                      "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24403adfb6SMatthew G Knepley 
25403adfb6SMatthew G Knepley #include <petscconf.h>
26403adfb6SMatthew G Knepley #include <petscsys.h>
27403adfb6SMatthew G Knepley #include <petsctime.h>
28403adfb6SMatthew G Knepley 
29403adfb6SMatthew G Knepley #define N        2000000
30caccb7e3SMatthew G Knepley #define N_DOUBLE 8000000
31403adfb6SMatthew G Knepley #define NTIMES   10
32403adfb6SMatthew G Knepley 
33403adfb6SMatthew G Knepley # ifndef MIN
34403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y)?(x):(y))
35403adfb6SMatthew G Knepley # endif
36403adfb6SMatthew G Knepley # ifndef MAX
37403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y)?(x):(y))
38403adfb6SMatthew G Knepley # endif
39403adfb6SMatthew G Knepley 
40403adfb6SMatthew G Knepley const float flt_eps  = 1.192092896e-07f;
41caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16;
42403adfb6SMatthew G Knepley 
43403adfb6SMatthew G Knepley __global__ void set_array(float *a,  float value, size_t len)
44403adfb6SMatthew G Knepley {
45403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
46403adfb6SMatthew G Knepley   while (idx < len) {
47403adfb6SMatthew G Knepley     a[idx] = value;
48403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
49403adfb6SMatthew G Knepley   }
50403adfb6SMatthew G Knepley }
51403adfb6SMatthew G Knepley 
52caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a,  double value, size_t len)
53caccb7e3SMatthew G Knepley {
54caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
55caccb7e3SMatthew G Knepley   while (idx < len) {
56caccb7e3SMatthew G Knepley     a[idx] = value;
57caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
58caccb7e3SMatthew G Knepley   }
59caccb7e3SMatthew G Knepley }
60caccb7e3SMatthew G Knepley 
61403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len)
62403adfb6SMatthew G Knepley {
63403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
64403adfb6SMatthew G Knepley   while (idx < len) {
65403adfb6SMatthew G Knepley     b[idx] = a[idx];
66403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
67403adfb6SMatthew G Knepley   }
68403adfb6SMatthew G Knepley }
69403adfb6SMatthew G Knepley 
70caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len)
71caccb7e3SMatthew G Knepley {
72caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
73caccb7e3SMatthew G Knepley   while (idx < len) {
74caccb7e3SMatthew G Knepley     b[idx] = a[idx];
75caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
76caccb7e3SMatthew G Knepley   }
77caccb7e3SMatthew G Knepley }
78caccb7e3SMatthew G Knepley 
79403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
80403adfb6SMatthew G Knepley {
81403adfb6SMatthew G Knepley   /*
82403adfb6SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
83403adfb6SMatthew G Knepley    * vector index space else return.
84403adfb6SMatthew G Knepley    */
85403adfb6SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
86403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
87403adfb6SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
88403adfb6SMatthew G Knepley }
89403adfb6SMatthew G Knepley 
90caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
91caccb7e3SMatthew G Knepley {
92caccb7e3SMatthew G Knepley   /*
93caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
94caccb7e3SMatthew G Knepley    * vector index space else return.
95caccb7e3SMatthew G Knepley    */
96caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
97caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
98caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
99caccb7e3SMatthew G Knepley }
100caccb7e3SMatthew G Knepley 
101403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
102403adfb6SMatthew G Knepley {
103403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
104403adfb6SMatthew G Knepley   while (idx < len) {
105403adfb6SMatthew G Knepley     b[idx] = scale* a[idx];
106403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
107403adfb6SMatthew G Knepley   }
108403adfb6SMatthew G Knepley }
109403adfb6SMatthew G Knepley 
110caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
111caccb7e3SMatthew G Knepley {
112caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
113caccb7e3SMatthew G Knepley   while (idx < len) {
114caccb7e3SMatthew G Knepley     b[idx] = scale* a[idx];
115caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
116caccb7e3SMatthew G Knepley   }
117caccb7e3SMatthew G Knepley }
118caccb7e3SMatthew G Knepley 
119caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
120caccb7e3SMatthew G Knepley {
121caccb7e3SMatthew G Knepley   /*
122caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
123caccb7e3SMatthew G Knepley    * vector index space else return.
124caccb7e3SMatthew G Knepley    */
125caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
126caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
127caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
128caccb7e3SMatthew G Knepley }
129caccb7e3SMatthew G Knepley 
130caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
131caccb7e3SMatthew G Knepley {
132caccb7e3SMatthew G Knepley   /*
133caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
134caccb7e3SMatthew G Knepley    * vector index space else return.
135caccb7e3SMatthew G Knepley    */
136caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
137caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
138caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
139caccb7e3SMatthew G Knepley }
140caccb7e3SMatthew G Knepley 
141403adfb6SMatthew G Knepley __global__ void STREAM_Add( float *a, float *b, float *c,  size_t len)
142403adfb6SMatthew G Knepley {
143403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
144403adfb6SMatthew G Knepley   while (idx < len) {
145403adfb6SMatthew G Knepley     c[idx] = a[idx]+b[idx];
146403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
147403adfb6SMatthew G Knepley   }
148403adfb6SMatthew G Knepley }
149403adfb6SMatthew G Knepley 
150caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double( double *a, double *b, double *c,  size_t len)
151caccb7e3SMatthew G Knepley {
152caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
153caccb7e3SMatthew G Knepley   while (idx < len) {
154caccb7e3SMatthew G Knepley     c[idx] = a[idx]+b[idx];
155caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
156caccb7e3SMatthew G Knepley   }
157caccb7e3SMatthew G Knepley }
158caccb7e3SMatthew G Knepley 
159caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized( float *a, float *b, float *c,  size_t len)
160caccb7e3SMatthew G Knepley {
161caccb7e3SMatthew G Knepley   /*
162caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
163caccb7e3SMatthew G Knepley    * vector index space else return.
164caccb7e3SMatthew G Knepley    */
165caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
166caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
167caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
168caccb7e3SMatthew G Knepley }
169caccb7e3SMatthew G Knepley 
170caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double( double *a, double *b, double *c,  size_t len)
171caccb7e3SMatthew G Knepley {
172caccb7e3SMatthew G Knepley   /*
173caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
174caccb7e3SMatthew G Knepley    * vector index space else return.
175caccb7e3SMatthew G Knepley    */
176caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
177caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
178caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
179caccb7e3SMatthew G Knepley }
180caccb7e3SMatthew G Knepley 
181403adfb6SMatthew G Knepley __global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len)
182403adfb6SMatthew G Knepley {
183403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
184403adfb6SMatthew G Knepley   while (idx < len) {
185403adfb6SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
186403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
187403adfb6SMatthew G Knepley   }
188403adfb6SMatthew G Knepley }
189403adfb6SMatthew G Knepley 
190caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double( double *a, double *b, double *c, double scalar, size_t len)
191caccb7e3SMatthew G Knepley {
192caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
193caccb7e3SMatthew G Knepley   while (idx < len) {
194caccb7e3SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
195caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
196caccb7e3SMatthew G Knepley   }
197caccb7e3SMatthew G Knepley }
198caccb7e3SMatthew G Knepley 
199caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized( float *a, float *b, float *c, float scalar, size_t len)
200caccb7e3SMatthew G Knepley {
201caccb7e3SMatthew G Knepley   /*
202caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
203caccb7e3SMatthew G Knepley    * vector index space else return.
204caccb7e3SMatthew G Knepley    */
205caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
206caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
207caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
208caccb7e3SMatthew G Knepley }
209caccb7e3SMatthew G Knepley 
210caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double( double *a, double *b, double *c, double scalar, size_t len)
211caccb7e3SMatthew G Knepley {
212caccb7e3SMatthew G Knepley   /*
213caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
214caccb7e3SMatthew G Knepley    * vector index space else return.
215caccb7e3SMatthew G Knepley    */
216caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
217caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
218caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
219caccb7e3SMatthew G Knepley }
220caccb7e3SMatthew G Knepley 
221403adfb6SMatthew G Knepley /* Host side verification routines */
222*a6dfd86eSKarl Rupp bool STREAM_Copy_verify(float *a, float *b, size_t len)
223*a6dfd86eSKarl Rupp {
224403adfb6SMatthew G Knepley   size_t idx;
225403adfb6SMatthew G Knepley   bool bDifferent = false;
226403adfb6SMatthew G Knepley 
227403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
228403adfb6SMatthew G Knepley     float expectedResult = a[idx];
229403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
230403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
231403adfb6SMatthew G Knepley     /* element-wise relative error determination */
232403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
233403adfb6SMatthew G Knepley   }
234403adfb6SMatthew G Knepley 
235403adfb6SMatthew G Knepley   return bDifferent;
236403adfb6SMatthew G Knepley }
237403adfb6SMatthew G Knepley 
238*a6dfd86eSKarl Rupp bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
239*a6dfd86eSKarl Rupp {
240caccb7e3SMatthew G Knepley   size_t idx;
241caccb7e3SMatthew G Knepley   bool bDifferent = false;
242caccb7e3SMatthew G Knepley 
243caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
244caccb7e3SMatthew G Knepley     double expectedResult = a[idx];
245caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
246caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
247caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
248caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
249caccb7e3SMatthew G Knepley   }
250caccb7e3SMatthew G Knepley 
251caccb7e3SMatthew G Knepley   return bDifferent;
252caccb7e3SMatthew G Knepley }
253caccb7e3SMatthew G Knepley 
254*a6dfd86eSKarl Rupp bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
255*a6dfd86eSKarl Rupp {
256403adfb6SMatthew G Knepley   size_t idx;
257403adfb6SMatthew G Knepley   bool bDifferent = false;
258403adfb6SMatthew G Knepley 
259403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
260403adfb6SMatthew G Knepley     float expectedResult = scale*a[idx];
261403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
262403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
263403adfb6SMatthew G Knepley     /* element-wise relative error determination */
264403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
265403adfb6SMatthew G Knepley   }
266403adfb6SMatthew G Knepley 
267403adfb6SMatthew G Knepley   return bDifferent;
268403adfb6SMatthew G Knepley }
269403adfb6SMatthew G Knepley 
270*a6dfd86eSKarl Rupp bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
271*a6dfd86eSKarl Rupp {
272caccb7e3SMatthew G Knepley   size_t idx;
273caccb7e3SMatthew G Knepley   bool bDifferent = false;
274caccb7e3SMatthew G Knepley 
275caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
276caccb7e3SMatthew G Knepley     double expectedResult = scale*a[idx];
277caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
278caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
279caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
280caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
281caccb7e3SMatthew G Knepley   }
282caccb7e3SMatthew G Knepley 
283caccb7e3SMatthew G Knepley   return bDifferent;
284caccb7e3SMatthew G Knepley }
285caccb7e3SMatthew G Knepley 
286*a6dfd86eSKarl Rupp bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
287*a6dfd86eSKarl Rupp {
288403adfb6SMatthew G Knepley   size_t idx;
289403adfb6SMatthew G Knepley   bool bDifferent = false;
290403adfb6SMatthew G Knepley 
291403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
292403adfb6SMatthew G Knepley     float expectedResult = a[idx] + b[idx];
293403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
294403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
295403adfb6SMatthew G Knepley     /* element-wise relative error determination */
296403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
297403adfb6SMatthew G Knepley   }
298403adfb6SMatthew G Knepley 
299403adfb6SMatthew G Knepley   return bDifferent;
300403adfb6SMatthew G Knepley }
301403adfb6SMatthew G Knepley 
302*a6dfd86eSKarl Rupp bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
303*a6dfd86eSKarl Rupp {
304caccb7e3SMatthew G Knepley   size_t idx;
305caccb7e3SMatthew G Knepley   bool bDifferent = false;
306caccb7e3SMatthew G Knepley 
307caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
308caccb7e3SMatthew G Knepley     double expectedResult = a[idx] + b[idx];
309caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
310caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
311caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
312caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
313caccb7e3SMatthew G Knepley   }
314caccb7e3SMatthew G Knepley 
315caccb7e3SMatthew G Knepley   return bDifferent;
316caccb7e3SMatthew G Knepley }
317caccb7e3SMatthew G Knepley 
318*a6dfd86eSKarl Rupp bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
319*a6dfd86eSKarl Rupp {
320403adfb6SMatthew G Knepley   size_t idx;
321403adfb6SMatthew G Knepley   bool bDifferent = false;
322403adfb6SMatthew G Knepley 
323403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
324403adfb6SMatthew G Knepley     float expectedResult = a[idx] + scalar*b[idx];
325403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
326403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
327403adfb6SMatthew G Knepley     /* element-wise relative error determination */
328403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 3.f);
329403adfb6SMatthew G Knepley   }
330403adfb6SMatthew G Knepley 
331403adfb6SMatthew G Knepley   return bDifferent;
332403adfb6SMatthew G Knepley }
333403adfb6SMatthew G Knepley 
334*a6dfd86eSKarl Rupp bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
335*a6dfd86eSKarl Rupp {
336caccb7e3SMatthew G Knepley   size_t idx;
337caccb7e3SMatthew G Knepley   bool bDifferent = false;
338caccb7e3SMatthew G Knepley 
339caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
340caccb7e3SMatthew G Knepley     double expectedResult = a[idx] + scalar*b[idx];
341caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
342caccb7e3SMatthew G Knepley     double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
343caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
344caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 3.);
345caccb7e3SMatthew G Knepley   }
346caccb7e3SMatthew G Knepley 
347caccb7e3SMatthew G Knepley   return bDifferent;
348caccb7e3SMatthew G Knepley }
349caccb7e3SMatthew G Knepley 
350403adfb6SMatthew G Knepley /* forward declarations */
351caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
352403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
354403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]);
355403adfb6SMatthew G Knepley 
356403adfb6SMatthew G Knepley int main(int argc, char *argv[])
357403adfb6SMatthew G Knepley {
358403adfb6SMatthew G Knepley   PetscInt       device    = 0;
359caccb7e3SMatthew G Knepley   PetscBool      runDouble = PETSC_FALSE;
360403adfb6SMatthew G Knepley   PetscBool      cpuTiming = PETSC_FALSE;
361403adfb6SMatthew G Knepley   PetscErrorCode ierr;
362403adfb6SMatthew G Knepley 
363403adfb6SMatthew G Knepley   ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
364caccb7e3SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
365403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);
366403adfb6SMatthew G Knepley 
367403adfb6SMatthew G Knepley   ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
368403adfb6SMatthew G Knepley     ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
369caccb7e3SMatthew G Knepley     ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr);
370403adfb6SMatthew G Knepley     ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
371403adfb6SMatthew G Knepley   ierr = PetscOptionsEnd();
372403adfb6SMatthew G Knepley 
373caccb7e3SMatthew G Knepley   ierr = setupStream(device, runDouble, cpuTiming);
374403adfb6SMatthew G Knepley   if (ierr >= 0) {
375403adfb6SMatthew G Knepley     PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
376403adfb6SMatthew G Knepley   }
377403adfb6SMatthew G Knepley   PetscFinalize();
378403adfb6SMatthew G Knepley   return 0;
379403adfb6SMatthew G Knepley }
380403adfb6SMatthew G Knepley 
381403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
382403adfb6SMatthew G Knepley //Run the appropriate tests
383403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
384caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
385403adfb6SMatthew G Knepley {
386403adfb6SMatthew G Knepley   PetscInt       iNumThreadsPerBlock = 128;
387403adfb6SMatthew G Knepley   PetscErrorCode ierr;
388403adfb6SMatthew G Knepley 
389403adfb6SMatthew G Knepley   PetscFunctionBegin;
390403adfb6SMatthew G Knepley   // Check device
391403adfb6SMatthew G Knepley   {
392403adfb6SMatthew G Knepley     int deviceCount;
393403adfb6SMatthew G Knepley 
394403adfb6SMatthew G Knepley     cudaGetDeviceCount(&deviceCount);
395403adfb6SMatthew G Knepley     if (deviceCount == 0) {
396403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
397403adfb6SMatthew G Knepley       return -1000;
398403adfb6SMatthew G Knepley     }
399403adfb6SMatthew G Knepley 
400403adfb6SMatthew G Knepley     if (deviceNum >= deviceCount || deviceNum < 0) {
401403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
402403adfb6SMatthew G Knepley       deviceNum = 0;
403403adfb6SMatthew G Knepley     }
404403adfb6SMatthew G Knepley   }
405403adfb6SMatthew G Knepley 
406403adfb6SMatthew G Knepley   cudaSetDevice(deviceNum);
407403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
408403adfb6SMatthew G Knepley   cudaDeviceProp deviceProp;
409403adfb6SMatthew G Knepley   if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
410403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr);
411403adfb6SMatthew G Knepley   } else {
412403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr);
413403adfb6SMatthew G Knepley     return -1;
414403adfb6SMatthew G Knepley   }
415403adfb6SMatthew G Knepley 
416caccb7e3SMatthew G Knepley   if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
417caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
418caccb7e3SMatthew G Knepley     return -1;
419caccb7e3SMatthew G Knepley   }
420403adfb6SMatthew G Knepley   if (deviceProp.major == 2 && deviceProp.minor == 1) {
421403adfb6SMatthew G Knepley     iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
422403adfb6SMatthew G Knepley   } else {
423403adfb6SMatthew G Knepley     iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
424403adfb6SMatthew G Knepley   }
425403adfb6SMatthew G Knepley 
426403adfb6SMatthew G Knepley   if (cpuTiming) {
427403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
428403adfb6SMatthew G Knepley   }
429403adfb6SMatthew G Knepley 
430403adfb6SMatthew G Knepley   ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
431caccb7e3SMatthew G Knepley   if (runDouble) {
432caccb7e3SMatthew G Knepley     ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
433caccb7e3SMatthew G Knepley     ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
434caccb7e3SMatthew G Knepley   }
435403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
436403adfb6SMatthew G Knepley }
437403adfb6SMatthew G Knepley 
438403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
439403adfb6SMatthew G Knepley // runStream
440403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
441403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
442403adfb6SMatthew G Knepley {
443403adfb6SMatthew G Knepley   float *d_a, *d_b, *d_c;
444403adfb6SMatthew G Knepley   int k;
445caccb7e3SMatthew G Knepley   float times[8][NTIMES];
446403adfb6SMatthew G Knepley   float scalar;
447403adfb6SMatthew G Knepley   PetscErrorCode ierr;
448403adfb6SMatthew G Knepley 
449403adfb6SMatthew G Knepley   PetscFunctionBegin;
450403adfb6SMatthew G Knepley   /* Allocate memory on device */
451403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
452403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
453403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);
454403adfb6SMatthew G Knepley 
455403adfb6SMatthew G Knepley   /* Compute execution configuration */
456403adfb6SMatthew G Knepley 
457403adfb6SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
458403adfb6SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
459403adfb6SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
460403adfb6SMatthew G Knepley 
461403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
462403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
463403adfb6SMatthew G Knepley 
464403adfb6SMatthew G Knepley   /* Initialize memory on the device */
465403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
466403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
467403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
468403adfb6SMatthew G Knepley 
469403adfb6SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
470403adfb6SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
471403adfb6SMatthew G Knepley   cudaEvent_t start, stop;
472403adfb6SMatthew G Knepley 
473403adfb6SMatthew G Knepley   /* both timers report msec */
474403adfb6SMatthew G Knepley   ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
475403adfb6SMatthew G Knepley   ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */
476403adfb6SMatthew G Knepley 
477403adfb6SMatthew G Knepley   scalar=3.0f;
478403adfb6SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
479403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
480403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
481403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
482403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
483403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
484403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
485403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
486caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
487403adfb6SMatthew G Knepley       times[0][k] = cpuTimer;
488403adfb6SMatthew G Knepley     } else {
489403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
490403adfb6SMatthew G Knepley     }
491403adfb6SMatthew G Knepley 
492403adfb6SMatthew G Knepley     cpuTimer = 0.0;
493403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
494403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
495403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
496403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
497403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
498403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
499403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
500caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
501403adfb6SMatthew G Knepley       times[1][k] = cpuTimer;
502403adfb6SMatthew G Knepley     } else {
503403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
504403adfb6SMatthew G Knepley     }
505403adfb6SMatthew G Knepley 
506403adfb6SMatthew G Knepley     cpuTimer = 0.0;
507403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
508403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
509403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
510403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
511403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
512403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
513403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
514403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
515403adfb6SMatthew G Knepley       times[2][k] = cpuTimer;
516403adfb6SMatthew G Knepley     } else {
517403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
518403adfb6SMatthew G Knepley     }
519403adfb6SMatthew G Knepley 
520403adfb6SMatthew G Knepley     cpuTimer = 0.0;
521403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
522403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
523caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
524403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
525403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
526403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
527403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
528403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
529403adfb6SMatthew G Knepley       times[3][k] = cpuTimer;
530403adfb6SMatthew G Knepley     } else {
531403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
532403adfb6SMatthew G Knepley     }
533403adfb6SMatthew G Knepley 
534403adfb6SMatthew G Knepley     cpuTimer = 0.0;
535403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
536403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
537caccb7e3SMatthew G Knepley     STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
538403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
539403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
540403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
541403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
542403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
543403adfb6SMatthew G Knepley       times[4][k] = cpuTimer;
544403adfb6SMatthew G Knepley     } else {
545403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
546403adfb6SMatthew G Knepley     }
547403adfb6SMatthew G Knepley 
548caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
549caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
550caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
551caccb7e3SMatthew G Knepley     STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
552caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
553caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
554caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
555caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
556caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
557caccb7e3SMatthew G Knepley       times[5][k] = cpuTimer;
558caccb7e3SMatthew G Knepley     } else {
559caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[5][k], start, stop );CHKERRQ(ierr);
560caccb7e3SMatthew G Knepley     }
561caccb7e3SMatthew G Knepley 
562caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
563caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
564caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
565caccb7e3SMatthew G Knepley     STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
566caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
567caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
568caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
569caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
570caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
571caccb7e3SMatthew G Knepley       times[6][k] = cpuTimer;
572caccb7e3SMatthew G Knepley     } else {
573caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[6][k], start, stop );CHKERRQ(ierr);
574caccb7e3SMatthew G Knepley     }
575caccb7e3SMatthew G Knepley 
576caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
577caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
578caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
579caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
580caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
581caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
582caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
583caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
584caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
585caccb7e3SMatthew G Knepley       times[7][k] = cpuTimer;
586caccb7e3SMatthew G Knepley     } else {
587caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[7][k], start, stop );CHKERRQ(ierr);
588caccb7e3SMatthew G Knepley     }
589caccb7e3SMatthew G Knepley 
590403adfb6SMatthew G Knepley   }
591403adfb6SMatthew G Knepley 
592403adfb6SMatthew G Knepley   /* verify kernels */
593403adfb6SMatthew G Knepley   float *h_a, *h_b, *h_c;
594403adfb6SMatthew G Knepley   bool errorSTREAMkernel = true;
595403adfb6SMatthew G Knepley 
596403adfb6SMatthew G Knepley   if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
597403adfb6SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
598403adfb6SMatthew G Knepley     exit(1);
599403adfb6SMatthew G Knepley   }
600403adfb6SMatthew G Knepley   if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
601403adfb6SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
602403adfb6SMatthew G Knepley     exit(1);
603403adfb6SMatthew G Knepley   }
604403adfb6SMatthew G Knepley 
605403adfb6SMatthew G Knepley   if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
606403adfb6SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
607403adfb6SMatthew G Knepley     exit(1);
608403adfb6SMatthew G Knepley   }
609403adfb6SMatthew G Knepley 
610403adfb6SMatthew G Knepley   /*
611403adfb6SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
612403adfb6SMatthew G Knepley    * device kernel output
613403adfb6SMatthew G Knepley    */
614403adfb6SMatthew G Knepley 
615403adfb6SMatthew G Knepley   /* Initialize memory on the device */
616403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
617403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
618403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
619403adfb6SMatthew G Knepley 
620403adfb6SMatthew G Knepley   STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
621403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
622403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
623403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
624403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
625403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
626403adfb6SMatthew G Knepley     exit(-2000);
627403adfb6SMatthew G Knepley   } else {
628403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
629403adfb6SMatthew G Knepley   }
630403adfb6SMatthew G Knepley 
631403adfb6SMatthew G Knepley   /* Initialize memory on the device */
632403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
633403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
634403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
635403adfb6SMatthew G Knepley 
636403adfb6SMatthew G Knepley   STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
637403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
638403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
639403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
640403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
641403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
642403adfb6SMatthew G Knepley     exit(-3000);
643403adfb6SMatthew G Knepley   } else {
644403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
645403adfb6SMatthew G Knepley   }
646403adfb6SMatthew G Knepley 
647403adfb6SMatthew G Knepley   /* Initialize memory on the device */
648403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
649403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
650403adfb6SMatthew G Knepley 
651403adfb6SMatthew G Knepley   STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
652403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
653403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
654403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
655403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
656403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
657403adfb6SMatthew G Knepley     exit(-4000);
658403adfb6SMatthew G Knepley   } else {
659403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
660403adfb6SMatthew G Knepley   }
661403adfb6SMatthew G Knepley 
662403adfb6SMatthew G Knepley   /* Initialize memory on the device */
663403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
664403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
665403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
666403adfb6SMatthew G Knepley 
667403adfb6SMatthew G Knepley   STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
668403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
669403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
670403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
671403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
672403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
673403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
674403adfb6SMatthew G Knepley     exit(-5000);
675403adfb6SMatthew G Knepley   } else {
676403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
677403adfb6SMatthew G Knepley   }
678403adfb6SMatthew G Knepley 
679403adfb6SMatthew G Knepley   /* Initialize memory on the device */
680403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
681403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
682403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
683403adfb6SMatthew G Knepley 
684403adfb6SMatthew G Knepley   STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
685403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
686403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
687403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
688403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
689403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
690403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
691403adfb6SMatthew G Knepley     exit(-6000);
692403adfb6SMatthew G Knepley   } else {
693403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
694403adfb6SMatthew G Knepley   }
695403adfb6SMatthew G Knepley 
696403adfb6SMatthew G Knepley   /* continue from here */
697403adfb6SMatthew G Knepley   printResultsReadable(times);
698403adfb6SMatthew G Knepley 
699403adfb6SMatthew G Knepley   //clean up timers
700403adfb6SMatthew G Knepley   ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
701403adfb6SMatthew G Knepley   ierr = cudaEventDestroy( start );CHKERRQ(ierr);
702403adfb6SMatthew G Knepley 
703403adfb6SMatthew G Knepley   /* Free memory on device */
704403adfb6SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
705403adfb6SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
706403adfb6SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
707403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
708403adfb6SMatthew G Knepley }
709403adfb6SMatthew G Knepley 
710caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
711caccb7e3SMatthew G Knepley {
712caccb7e3SMatthew G Knepley   double *d_a, *d_b, *d_c;
713caccb7e3SMatthew G Knepley   int k;
714caccb7e3SMatthew G Knepley   float times[8][NTIMES];
715caccb7e3SMatthew G Knepley   double scalar;
716caccb7e3SMatthew G Knepley   PetscErrorCode ierr;
717caccb7e3SMatthew G Knepley 
718caccb7e3SMatthew G Knepley   PetscFunctionBegin;
719caccb7e3SMatthew G Knepley   /* Allocate memory on device */
720caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
721caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
722caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);
723caccb7e3SMatthew G Knepley 
724caccb7e3SMatthew G Knepley   /* Compute execution configuration */
725caccb7e3SMatthew G Knepley 
726caccb7e3SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
727caccb7e3SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
728caccb7e3SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
729caccb7e3SMatthew G Knepley 
730caccb7e3SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
731caccb7e3SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
732caccb7e3SMatthew G Knepley 
733caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
734caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
735caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
736caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
737caccb7e3SMatthew G Knepley 
738caccb7e3SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
739caccb7e3SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
740caccb7e3SMatthew G Knepley   cudaEvent_t start, stop;
741caccb7e3SMatthew G Knepley 
742caccb7e3SMatthew G Knepley   /* both timers report msec */
743caccb7e3SMatthew G Knepley   ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
744caccb7e3SMatthew G Knepley   ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */
745caccb7e3SMatthew G Knepley 
746caccb7e3SMatthew G Knepley   scalar=3.0;
747caccb7e3SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
748caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
749caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
750caccb7e3SMatthew G Knepley     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
751caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
752caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
753caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
754caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
755caccb7e3SMatthew G Knepley       PetscTimeAdd(cpuTimer);
756caccb7e3SMatthew G Knepley       times[0][k] = cpuTimer;
757caccb7e3SMatthew G Knepley     } else {
758caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
759caccb7e3SMatthew G Knepley     }
760caccb7e3SMatthew G Knepley 
761caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
762caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
763caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
764caccb7e3SMatthew G Knepley     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
765caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
766caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
767caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
768caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
769caccb7e3SMatthew G Knepley       PetscTimeAdd(cpuTimer);
770caccb7e3SMatthew G Knepley       times[1][k] = cpuTimer;
771caccb7e3SMatthew G Knepley     } else {
772caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
773caccb7e3SMatthew G Knepley     }
774caccb7e3SMatthew G Knepley 
775caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
776caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
777caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
778caccb7e3SMatthew G Knepley     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
779caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
780caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
781caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
782caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
783caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
784caccb7e3SMatthew G Knepley       times[2][k] = cpuTimer;
785caccb7e3SMatthew G Knepley     } else {
786caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
787caccb7e3SMatthew G Knepley     }
788caccb7e3SMatthew G Knepley 
789caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
790caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
791caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
792caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
793caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
794caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
795caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
796caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
797caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
798caccb7e3SMatthew G Knepley       times[3][k] = cpuTimer;
799caccb7e3SMatthew G Knepley     } else {
800caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
801caccb7e3SMatthew G Knepley     }
802caccb7e3SMatthew G Knepley 
803caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
804caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
805caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
806caccb7e3SMatthew G Knepley     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
807caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
808caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
809caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
810caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
811caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
812caccb7e3SMatthew G Knepley       times[4][k] = cpuTimer;
813caccb7e3SMatthew G Knepley     } else {
814caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
815caccb7e3SMatthew G Knepley     }
816caccb7e3SMatthew G Knepley 
817caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
818caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
819caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
820caccb7e3SMatthew G Knepley     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
821caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
822caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
823caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
824caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
825caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
826caccb7e3SMatthew G Knepley       times[5][k] = cpuTimer;
827caccb7e3SMatthew G Knepley     } else {
828caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
829caccb7e3SMatthew G Knepley     }
830caccb7e3SMatthew G Knepley 
831caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
832caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
833caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
834caccb7e3SMatthew G Knepley     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
835caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
836caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
837caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
838caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
839caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
840caccb7e3SMatthew G Knepley       times[6][k] = cpuTimer;
841caccb7e3SMatthew G Knepley     } else {
842caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
843caccb7e3SMatthew G Knepley     }
844caccb7e3SMatthew G Knepley 
845caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
846caccb7e3SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
847caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
848caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
849caccb7e3SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
850caccb7e3SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
851caccb7e3SMatthew G Knepley     //get the the total elapsed time in ms
852caccb7e3SMatthew G Knepley     PetscTimeAdd(cpuTimer);
853caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
854caccb7e3SMatthew G Knepley       times[7][k] = cpuTimer;
855caccb7e3SMatthew G Knepley     } else {
856caccb7e3SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
857caccb7e3SMatthew G Knepley     }
858caccb7e3SMatthew G Knepley 
859caccb7e3SMatthew G Knepley   }
860caccb7e3SMatthew G Knepley 
861caccb7e3SMatthew G Knepley   /* verify kernels */
862caccb7e3SMatthew G Knepley   double *h_a, *h_b, *h_c;
863caccb7e3SMatthew G Knepley   bool errorSTREAMkernel = true;
864caccb7e3SMatthew G Knepley 
865caccb7e3SMatthew G Knepley   if ( (h_a = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
866caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
867caccb7e3SMatthew G Knepley     exit(1);
868caccb7e3SMatthew G Knepley   }
869caccb7e3SMatthew G Knepley   if ( (h_b = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
870caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
871caccb7e3SMatthew G Knepley     exit(1);
872caccb7e3SMatthew G Knepley   }
873caccb7e3SMatthew G Knepley 
874caccb7e3SMatthew G Knepley   if ( (h_c = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
875caccb7e3SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
876caccb7e3SMatthew G Knepley     exit(1);
877caccb7e3SMatthew G Knepley   }
878caccb7e3SMatthew G Knepley 
879caccb7e3SMatthew G Knepley   /*
880caccb7e3SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
881caccb7e3SMatthew G Knepley    * device kernel output
882caccb7e3SMatthew G Knepley    */
883caccb7e3SMatthew G Knepley 
884caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
885caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
886caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
887caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
888caccb7e3SMatthew G Knepley 
889caccb7e3SMatthew G Knepley   STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
890caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
891caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
892caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
893caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
894caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
895caccb7e3SMatthew G Knepley     exit(-2000);
896caccb7e3SMatthew G Knepley   } else {
897caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
898caccb7e3SMatthew G Knepley   }
899caccb7e3SMatthew G Knepley 
900caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
901caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
902caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
903caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
904caccb7e3SMatthew G Knepley 
905caccb7e3SMatthew G Knepley   STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
906caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
907caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
908caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
909caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
910caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
911caccb7e3SMatthew G Knepley     exit(-3000);
912caccb7e3SMatthew G Knepley   } else {
913caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
914caccb7e3SMatthew G Knepley   }
915caccb7e3SMatthew G Knepley 
916caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
917caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
918caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
919caccb7e3SMatthew G Knepley 
920caccb7e3SMatthew G Knepley   STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
921caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
922caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
923caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
924caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
925caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
926caccb7e3SMatthew G Knepley     exit(-4000);
927caccb7e3SMatthew G Knepley   } else {
928caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
929caccb7e3SMatthew G Knepley   }
930caccb7e3SMatthew G Knepley 
931caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
932caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
933caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
934caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
935caccb7e3SMatthew G Knepley 
936caccb7e3SMatthew G Knepley   STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
937caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
938caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
939caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
940caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
941caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
942caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
943caccb7e3SMatthew G Knepley     exit(-5000);
944caccb7e3SMatthew G Knepley   } else {
945caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
946caccb7e3SMatthew G Knepley   }
947caccb7e3SMatthew G Knepley 
948caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
949caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
950caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
951caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
952caccb7e3SMatthew G Knepley 
953caccb7e3SMatthew G Knepley   STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
954caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
955caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
956caccb7e3SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
957caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
958caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
959caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
960caccb7e3SMatthew G Knepley     exit(-6000);
961caccb7e3SMatthew G Knepley   } else {
962caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
963caccb7e3SMatthew G Knepley   }
964caccb7e3SMatthew G Knepley 
965caccb7e3SMatthew G Knepley   /* continue from here */
966caccb7e3SMatthew G Knepley   printResultsReadable(times);
967caccb7e3SMatthew G Knepley 
968caccb7e3SMatthew G Knepley   //clean up timers
969caccb7e3SMatthew G Knepley   ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
970caccb7e3SMatthew G Knepley   ierr = cudaEventDestroy( start );CHKERRQ(ierr);
971caccb7e3SMatthew G Knepley 
972caccb7e3SMatthew G Knepley   /* Free memory on device */
973caccb7e3SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
974caccb7e3SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
975caccb7e3SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
976caccb7e3SMatthew G Knepley   PetscFunctionReturn(0);
977caccb7e3SMatthew G Knepley }
978caccb7e3SMatthew G Knepley 
979403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
980403adfb6SMatthew G Knepley //Print Results to Screen and File
981403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
982*a6dfd86eSKarl Rupp PetscErrorCode printResultsReadable(float times[][NTIMES])
983*a6dfd86eSKarl Rupp {
984403adfb6SMatthew G Knepley   PetscErrorCode ierr;
985403adfb6SMatthew G Knepley   PetscInt       j, k;
986caccb7e3SMatthew G Knepley   float          avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
987caccb7e3SMatthew G Knepley   float          maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
988caccb7e3SMatthew G Knepley   float          mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
989caccb7e3SMatthew G Knepley   char *label[8]   = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
990caccb7e3SMatthew G Knepley   float          bytes_per_kernel[8] = {
991403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
992403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
993403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
994caccb7e3SMatthew G Knepley     2. * sizeof(float) * N,
995caccb7e3SMatthew G Knepley     3. * sizeof(float) * N,
996caccb7e3SMatthew G Knepley     3. * sizeof(float) * N,
997403adfb6SMatthew G Knepley     3. * sizeof(float) * N,
998403adfb6SMatthew G Knepley     3. * sizeof(float) * N
999403adfb6SMatthew G Knepley   };
1000403adfb6SMatthew G Knepley 
1001403adfb6SMatthew G Knepley   PetscFunctionBegin;
1002403adfb6SMatthew G Knepley   /* --- SUMMARY --- */
1003403adfb6SMatthew G Knepley   for (k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */
1004caccb7e3SMatthew G Knepley     for (j = 0; j < 8; ++j) {
1005403adfb6SMatthew G Knepley       avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
1006403adfb6SMatthew G Knepley       mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
1007403adfb6SMatthew G Knepley       maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
1008403adfb6SMatthew G Knepley     }
1009403adfb6SMatthew G Knepley   }
1010403adfb6SMatthew G Knepley 
1011403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);
1012403adfb6SMatthew G Knepley 
1013caccb7e3SMatthew G Knepley   for (j = 0; j < 8; ++j) {
1014403adfb6SMatthew G Knepley      avgtime[j] = avgtime[j]/(float)(NTIMES-1);
1015403adfb6SMatthew G Knepley      ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
1016403adfb6SMatthew G Knepley   }
1017403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
1018403adfb6SMatthew G Knepley }
1019