xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision dd4005766979d6c32c7873f45a6074c17defa719)
1403adfb6SMatthew G Knepley /*
2403adfb6SMatthew G Knepley   STREAM benchmark implementation in CUDA.
3403adfb6SMatthew G Knepley 
4403adfb6SMatthew G Knepley     COPY:       a(i) = b(i)
5403adfb6SMatthew G Knepley     SCALE:      a(i) = q*b(i)
6403adfb6SMatthew G Knepley     SUM:        a(i) = b(i) + c(i)
7403adfb6SMatthew G Knepley     TRIAD:      a(i) = b(i) + q*c(i)
8403adfb6SMatthew G Knepley 
9403adfb6SMatthew G Knepley   It measures the memory system on the device.
1019816777SMark   The implementation is in double precision with a single option.
11403adfb6SMatthew G Knepley 
12403adfb6SMatthew G Knepley   Code based on the code developed by John D. McCalpin
13403adfb6SMatthew G Knepley   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14403adfb6SMatthew G Knepley 
15403adfb6SMatthew G Knepley   Written by: Massimiliano Fatica, NVIDIA Corporation
16403adfb6SMatthew G Knepley   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17403adfb6SMatthew G Knepley   Extensive Revisions, 4 December 2010
18403adfb6SMatthew G Knepley   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19403adfb6SMatthew G Knepley 
20403adfb6SMatthew G Knepley   User interface motivated by bandwidthTest NVIDIA SDK example.
21403adfb6SMatthew G Knepley */
2219816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23403adfb6SMatthew G Knepley 
24403adfb6SMatthew G Knepley #include <petscconf.h>
25403adfb6SMatthew G Knepley #include <petscsys.h>
26403adfb6SMatthew G Knepley #include <petsctime.h>
27403adfb6SMatthew G Knepley 
2819816777SMark #define N        10000000
29403adfb6SMatthew G Knepley #define NTIMES   10
30403adfb6SMatthew G Knepley 
31403adfb6SMatthew G Knepley # ifndef MIN
32403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y) ? (x) : (y))
33403adfb6SMatthew G Knepley # endif
34403adfb6SMatthew G Knepley # ifndef MAX
35403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y) ? (x) : (y))
36403adfb6SMatthew G Knepley # endif
37403adfb6SMatthew G Knepley 
38403adfb6SMatthew G Knepley const float  flt_eps = 1.192092896e-07f;
39caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16;
40403adfb6SMatthew G Knepley 
41403adfb6SMatthew G Knepley __global__ void set_array(float *a,  float value, size_t len)
42403adfb6SMatthew G Knepley {
43403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
44403adfb6SMatthew G Knepley   while (idx < len) {
45403adfb6SMatthew G Knepley     a[idx] = value;
46403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
47403adfb6SMatthew G Knepley   }
48403adfb6SMatthew G Knepley }
49403adfb6SMatthew G Knepley 
50caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a,  double value, size_t len)
51caccb7e3SMatthew G Knepley {
52caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
53caccb7e3SMatthew G Knepley   while (idx < len) {
54caccb7e3SMatthew G Knepley     a[idx] = value;
55caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
56caccb7e3SMatthew G Knepley   }
57caccb7e3SMatthew G Knepley }
58caccb7e3SMatthew G Knepley 
59403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len)
60403adfb6SMatthew G Knepley {
61403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
62403adfb6SMatthew G Knepley   while (idx < len) {
63403adfb6SMatthew G Knepley     b[idx] = a[idx];
64403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
65403adfb6SMatthew G Knepley   }
66403adfb6SMatthew G Knepley }
67403adfb6SMatthew G Knepley 
68caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len)
69caccb7e3SMatthew G Knepley {
70caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
71caccb7e3SMatthew G Knepley   while (idx < len) {
72caccb7e3SMatthew G Knepley     b[idx] = a[idx];
73caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
74caccb7e3SMatthew G Knepley   }
75caccb7e3SMatthew G Knepley }
76caccb7e3SMatthew G Knepley 
77403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
78403adfb6SMatthew G Knepley {
79403adfb6SMatthew G Knepley   /*
80403adfb6SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
81403adfb6SMatthew G Knepley    * vector index space else return.
82403adfb6SMatthew G Knepley    */
83403adfb6SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
84403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
85403adfb6SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
86403adfb6SMatthew G Knepley }
87403adfb6SMatthew G Knepley 
88caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
89caccb7e3SMatthew G Knepley {
90caccb7e3SMatthew G Knepley   /*
91caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
92caccb7e3SMatthew G Knepley    * vector index space else return.
93caccb7e3SMatthew G Knepley    */
94caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
95caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
96caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
97caccb7e3SMatthew G Knepley }
98caccb7e3SMatthew G Knepley 
99403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
100403adfb6SMatthew G Knepley {
101403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
102403adfb6SMatthew G Knepley   while (idx < len) {
103403adfb6SMatthew G Knepley     b[idx] = scale* a[idx];
104403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
105403adfb6SMatthew G Knepley   }
106403adfb6SMatthew G Knepley }
107403adfb6SMatthew G Knepley 
108caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
109caccb7e3SMatthew G Knepley {
110caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
111caccb7e3SMatthew G Knepley   while (idx < len) {
112caccb7e3SMatthew G Knepley     b[idx] = scale* a[idx];
113caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
114caccb7e3SMatthew G Knepley   }
115caccb7e3SMatthew G Knepley }
116caccb7e3SMatthew G Knepley 
117caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
118caccb7e3SMatthew G Knepley {
119caccb7e3SMatthew G Knepley   /*
120caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
121caccb7e3SMatthew G Knepley    * vector index space else return.
122caccb7e3SMatthew G Knepley    */
123caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
124caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
125caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
126caccb7e3SMatthew G Knepley }
127caccb7e3SMatthew G Knepley 
128caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
129caccb7e3SMatthew G Knepley {
130caccb7e3SMatthew G Knepley   /*
131caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
132caccb7e3SMatthew G Knepley    * vector index space else return.
133caccb7e3SMatthew G Knepley    */
134caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
135caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
136caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale* a[idx];
137caccb7e3SMatthew G Knepley }
138caccb7e3SMatthew G Knepley 
139403adfb6SMatthew G Knepley __global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
140403adfb6SMatthew G Knepley {
141403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
142403adfb6SMatthew G Knepley   while (idx < len) {
143403adfb6SMatthew G Knepley     c[idx] = a[idx]+b[idx];
144403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
145403adfb6SMatthew G Knepley   }
146403adfb6SMatthew G Knepley }
147403adfb6SMatthew G Knepley 
148caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
149caccb7e3SMatthew G Knepley {
150caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
151caccb7e3SMatthew G Knepley   while (idx < len) {
152caccb7e3SMatthew G Knepley     c[idx] = a[idx]+b[idx];
153caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
154caccb7e3SMatthew G Knepley   }
155caccb7e3SMatthew G Knepley }
156caccb7e3SMatthew G Knepley 
157caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
158caccb7e3SMatthew G Knepley {
159caccb7e3SMatthew G Knepley   /*
160caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
161caccb7e3SMatthew G Knepley    * vector index space else return.
162caccb7e3SMatthew G Knepley    */
163caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
164caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
165caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
166caccb7e3SMatthew G Knepley }
167caccb7e3SMatthew G Knepley 
168caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
169caccb7e3SMatthew G Knepley {
170caccb7e3SMatthew G Knepley   /*
171caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
172caccb7e3SMatthew G Knepley    * vector index space else return.
173caccb7e3SMatthew G Knepley    */
174caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
175caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
176caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+b[idx];
177caccb7e3SMatthew G Knepley }
178caccb7e3SMatthew G Knepley 
179403adfb6SMatthew G Knepley __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
180403adfb6SMatthew G Knepley {
181403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
182403adfb6SMatthew G Knepley   while (idx < len) {
183403adfb6SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
184403adfb6SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
185403adfb6SMatthew G Knepley   }
186403adfb6SMatthew G Knepley }
187403adfb6SMatthew G Knepley 
188caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
189caccb7e3SMatthew G Knepley {
190caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
191caccb7e3SMatthew G Knepley   while (idx < len) {
192caccb7e3SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
193caccb7e3SMatthew G Knepley     idx   += blockDim.x * gridDim.x;
194caccb7e3SMatthew G Knepley   }
195caccb7e3SMatthew G Knepley }
196caccb7e3SMatthew G Knepley 
197caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
198caccb7e3SMatthew G Knepley {
199caccb7e3SMatthew G Knepley   /*
200caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
201caccb7e3SMatthew G Knepley    * vector index space else return.
202caccb7e3SMatthew G Knepley    */
203caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
204caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
205caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
206caccb7e3SMatthew G Knepley }
207caccb7e3SMatthew G Knepley 
208caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
209caccb7e3SMatthew G Knepley {
210caccb7e3SMatthew G Knepley   /*
211caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
212caccb7e3SMatthew G Knepley    * vector index space else return.
213caccb7e3SMatthew G Knepley    */
214caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
215caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
216caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx]+scalar*b[idx];
217caccb7e3SMatthew G Knepley }
218caccb7e3SMatthew G Knepley 
219403adfb6SMatthew G Knepley /* Host side verification routines */
220a6dfd86eSKarl Rupp bool STREAM_Copy_verify(float *a, float *b, size_t len)
221a6dfd86eSKarl Rupp {
222403adfb6SMatthew G Knepley   size_t idx;
223403adfb6SMatthew G Knepley   bool   bDifferent = false;
224403adfb6SMatthew G Knepley 
225403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
226403adfb6SMatthew G Knepley     float expectedResult     = a[idx];
227403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
228403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
229403adfb6SMatthew G Knepley     /* element-wise relative error determination */
230403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
231403adfb6SMatthew G Knepley   }
232403adfb6SMatthew G Knepley 
233403adfb6SMatthew G Knepley   return bDifferent;
234403adfb6SMatthew G Knepley }
235403adfb6SMatthew G Knepley 
236a6dfd86eSKarl Rupp bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
237a6dfd86eSKarl Rupp {
238caccb7e3SMatthew G Knepley   size_t idx;
239caccb7e3SMatthew G Knepley   bool   bDifferent = false;
240caccb7e3SMatthew G Knepley 
241caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
242caccb7e3SMatthew G Knepley     double expectedResult     = a[idx];
243caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
24419816777SMark     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/dbl_eps;
245caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
246caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
247caccb7e3SMatthew G Knepley   }
248caccb7e3SMatthew G Knepley 
249caccb7e3SMatthew G Knepley   return bDifferent;
250caccb7e3SMatthew G Knepley }
251caccb7e3SMatthew G Knepley 
252a6dfd86eSKarl Rupp bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
253a6dfd86eSKarl Rupp {
254403adfb6SMatthew G Knepley   size_t idx;
255403adfb6SMatthew G Knepley   bool   bDifferent = false;
256403adfb6SMatthew G Knepley 
257403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
258403adfb6SMatthew G Knepley     float expectedResult     = scale*a[idx];
259403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
260403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
261403adfb6SMatthew G Knepley     /* element-wise relative error determination */
262403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
263403adfb6SMatthew G Knepley   }
264403adfb6SMatthew G Knepley 
265403adfb6SMatthew G Knepley   return bDifferent;
266403adfb6SMatthew G Knepley }
267403adfb6SMatthew G Knepley 
268a6dfd86eSKarl Rupp bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
269a6dfd86eSKarl Rupp {
270caccb7e3SMatthew G Knepley   size_t idx;
271caccb7e3SMatthew G Knepley   bool   bDifferent = false;
272caccb7e3SMatthew G Knepley 
273caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
274caccb7e3SMatthew G Knepley     double expectedResult     = scale*a[idx];
275caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
276caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
277caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
278caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
279caccb7e3SMatthew G Knepley   }
280caccb7e3SMatthew G Knepley 
281caccb7e3SMatthew G Knepley   return bDifferent;
282caccb7e3SMatthew G Knepley }
283caccb7e3SMatthew G Knepley 
284a6dfd86eSKarl Rupp bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
285a6dfd86eSKarl Rupp {
286403adfb6SMatthew G Knepley   size_t idx;
287403adfb6SMatthew G Knepley   bool   bDifferent = false;
288403adfb6SMatthew G Knepley 
289403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
290403adfb6SMatthew G Knepley     float expectedResult     = a[idx] + b[idx];
291403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
292403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
293403adfb6SMatthew G Knepley     /* element-wise relative error determination */
294403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
295403adfb6SMatthew G Knepley   }
296403adfb6SMatthew G Knepley 
297403adfb6SMatthew G Knepley   return bDifferent;
298403adfb6SMatthew G Knepley }
299403adfb6SMatthew G Knepley 
300a6dfd86eSKarl Rupp bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
301a6dfd86eSKarl Rupp {
302caccb7e3SMatthew G Knepley   size_t idx;
303caccb7e3SMatthew G Knepley   bool   bDifferent = false;
304caccb7e3SMatthew G Knepley 
305caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
306caccb7e3SMatthew G Knepley     double expectedResult     = a[idx] + b[idx];
307caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
308caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
309caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
310caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 2.);
311caccb7e3SMatthew G Knepley   }
312caccb7e3SMatthew G Knepley 
313caccb7e3SMatthew G Knepley   return bDifferent;
314caccb7e3SMatthew G Knepley }
315caccb7e3SMatthew G Knepley 
316a6dfd86eSKarl Rupp bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
317a6dfd86eSKarl Rupp {
318403adfb6SMatthew G Knepley   size_t idx;
319403adfb6SMatthew G Knepley   bool   bDifferent = false;
320403adfb6SMatthew G Knepley 
321403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
322403adfb6SMatthew G Knepley     float expectedResult     = a[idx] + scalar*b[idx];
323403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
324403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
325403adfb6SMatthew G Knepley     /* element-wise relative error determination */
326403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 3.f);
327403adfb6SMatthew G Knepley   }
328403adfb6SMatthew G Knepley 
329403adfb6SMatthew G Knepley   return bDifferent;
330403adfb6SMatthew G Knepley }
331403adfb6SMatthew G Knepley 
332a6dfd86eSKarl Rupp bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
333a6dfd86eSKarl Rupp {
334caccb7e3SMatthew G Knepley   size_t idx;
335caccb7e3SMatthew G Knepley   bool   bDifferent = false;
336caccb7e3SMatthew G Knepley 
337caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
338caccb7e3SMatthew G Knepley     double expectedResult     = a[idx] + scalar*b[idx];
339caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
340caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
341caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
342caccb7e3SMatthew G Knepley     bDifferent = (relErrorULPS > 3.);
343caccb7e3SMatthew G Knepley   }
344caccb7e3SMatthew G Knepley 
345caccb7e3SMatthew G Knepley   return bDifferent;
346caccb7e3SMatthew G Knepley }
347caccb7e3SMatthew G Knepley 
348403adfb6SMatthew G Knepley /* forward declarations */
349caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
350403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
351caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
35219816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
353403adfb6SMatthew G Knepley 
354403adfb6SMatthew G Knepley int main(int argc, char *argv[])
355403adfb6SMatthew G Knepley {
356403adfb6SMatthew G Knepley   PetscInt       device    = 0;
35719816777SMark   PetscBool      runDouble = PETSC_TRUE;
35819816777SMark   const PetscBool cpuTiming = PETSC_TRUE; // must be true
359403adfb6SMatthew G Knepley   PetscErrorCode ierr;
360403adfb6SMatthew G Knepley 
36119816777SMark   ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
36219816777SMark 
363a438ae71SBarry Smith   ierr = PetscInitialize(&argc, &argv, 0, help);if (ierr) return ierr;
364403adfb6SMatthew G Knepley 
365403adfb6SMatthew G Knepley   ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
3665a856986SBarry Smith   ierr = PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0);CHKERRQ(ierr);
3670298fd71SBarry Smith   ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr);
368403adfb6SMatthew G Knepley   ierr = PetscOptionsEnd();
369403adfb6SMatthew G Knepley 
370caccb7e3SMatthew G Knepley   ierr = setupStream(device, runDouble, cpuTiming);
37119816777SMark   if (ierr) {
37226f47effSBarry Smith     ierr = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr);
373403adfb6SMatthew G Knepley   }
37426f47effSBarry Smith   ierr = PetscFinalize();
37526f47effSBarry Smith   return ierr;
376403adfb6SMatthew G Knepley }
377403adfb6SMatthew G Knepley 
378403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
379403adfb6SMatthew G Knepley //Run the appropriate tests
380403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
381caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
382403adfb6SMatthew G Knepley {
383403adfb6SMatthew G Knepley   PetscInt       iNumThreadsPerBlock = 128;
384403adfb6SMatthew G Knepley   PetscErrorCode ierr;
385403adfb6SMatthew G Knepley 
386403adfb6SMatthew G Knepley   PetscFunctionBegin;
387403adfb6SMatthew G Knepley   // Check device
388403adfb6SMatthew G Knepley   {
389403adfb6SMatthew G Knepley     int deviceCount;
390403adfb6SMatthew G Knepley 
391403adfb6SMatthew G Knepley     cudaGetDeviceCount(&deviceCount);
392403adfb6SMatthew G Knepley     if (deviceCount == 0) {
393403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
394403adfb6SMatthew G Knepley       return -1000;
395403adfb6SMatthew G Knepley     }
396403adfb6SMatthew G Knepley 
397403adfb6SMatthew G Knepley     if (deviceNum >= deviceCount || deviceNum < 0) {
398403adfb6SMatthew G Knepley       ierr      = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
399403adfb6SMatthew G Knepley       deviceNum = 0;
400403adfb6SMatthew G Knepley     }
401403adfb6SMatthew G Knepley   }
402403adfb6SMatthew G Knepley 
403403adfb6SMatthew G Knepley   cudaSetDevice(deviceNum);
40419816777SMark   // ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
405403adfb6SMatthew G Knepley   cudaDeviceProp deviceProp;
40619816777SMark   if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
407403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr);
408403adfb6SMatthew G Knepley     return -1;
409403adfb6SMatthew G Knepley   }
410403adfb6SMatthew G Knepley 
411caccb7e3SMatthew G Knepley   if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
412caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
413caccb7e3SMatthew G Knepley     return -1;
414caccb7e3SMatthew G Knepley   }
4156f2b61bcSKarl Rupp   if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
4166f2b61bcSKarl Rupp   else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
417403adfb6SMatthew G Knepley 
418caccb7e3SMatthew G Knepley   if (runDouble) {
419caccb7e3SMatthew G Knepley     ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
42019816777SMark   } else {
42119816777SMark     ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
422caccb7e3SMatthew G Knepley   }
423403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
424403adfb6SMatthew G Knepley }
425403adfb6SMatthew G Knepley 
426403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
427403adfb6SMatthew G Knepley // runStream
428403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
429403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
430403adfb6SMatthew G Knepley {
431403adfb6SMatthew G Knepley   float          *d_a, *d_b, *d_c;
432403adfb6SMatthew G Knepley   int            k;
433caccb7e3SMatthew G Knepley   float          times[8][NTIMES];
434403adfb6SMatthew G Knepley   float          scalar;
435403adfb6SMatthew G Knepley   PetscErrorCode ierr;
436403adfb6SMatthew G Knepley 
437403adfb6SMatthew G Knepley   PetscFunctionBegin;
438403adfb6SMatthew G Knepley   /* Allocate memory on device */
439403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
440403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
441403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);
442403adfb6SMatthew G Knepley 
443403adfb6SMatthew G Knepley   /* Compute execution configuration */
444403adfb6SMatthew G Knepley 
445403adfb6SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
446403adfb6SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
447403adfb6SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
448403adfb6SMatthew G Knepley 
449403adfb6SMatthew G Knepley   /* Initialize memory on the device */
450403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
451403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
452403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
453403adfb6SMatthew G Knepley 
454403adfb6SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
455403adfb6SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
456403adfb6SMatthew G Knepley 
457403adfb6SMatthew G Knepley   scalar=3.0f;
458403adfb6SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
4598563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
460403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
46119816777SMark     cudaStreamSynchronize(NULL);
462ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
4638563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
46419816777SMark     if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec
465403adfb6SMatthew G Knepley 
466403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4678563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
468403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
46919816777SMark     cudaStreamSynchronize(NULL);
470ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
471df3898eeSBarry Smith     //get the total elapsed time in ms
4728563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
47319816777SMark     if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;
474403adfb6SMatthew G Knepley 
475403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4768563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
477403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
47819816777SMark     cudaStreamSynchronize(NULL);
479ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
480df3898eeSBarry Smith     //get the total elapsed time in ms
4818563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
48219816777SMark     if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;
483403adfb6SMatthew G Knepley 
484403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4858563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
486caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
48719816777SMark     cudaStreamSynchronize(NULL);
488ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
489df3898eeSBarry Smith     //get the total elapsed time in ms
4908563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
49119816777SMark     if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;
492403adfb6SMatthew G Knepley 
493403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4948563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
49519816777SMark     // ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
496caccb7e3SMatthew G Knepley     STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
49719816777SMark     cudaStreamSynchronize(NULL);
498ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
499ffc4695bSBarry Smith     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
50019816777SMark     // ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
501df3898eeSBarry Smith     //get the total elapsed time in ms
5028563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
50319816777SMark     if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
5046f2b61bcSKarl Rupp     else {
50519816777SMark       // ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
506403adfb6SMatthew G Knepley     }
507403adfb6SMatthew G Knepley 
508caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
5098563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
510caccb7e3SMatthew G Knepley     STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
51119816777SMark     cudaStreamSynchronize(NULL);
512ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
513df3898eeSBarry Smith     //get the total elapsed time in ms
5148563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
51519816777SMark     if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;
516caccb7e3SMatthew G Knepley 
517caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
5188563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
519caccb7e3SMatthew G Knepley     STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
52019816777SMark     cudaStreamSynchronize(NULL);
521ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
522df3898eeSBarry Smith     //get the total elapsed time in ms
5238563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
52419816777SMark     if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;
525caccb7e3SMatthew G Knepley 
526caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
5278563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
528caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
52919816777SMark     cudaStreamSynchronize(NULL);
530ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
531df3898eeSBarry Smith     //get the total elapsed time in ms
5328563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
53319816777SMark     if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
534caccb7e3SMatthew G Knepley   }
535caccb7e3SMatthew G Knepley 
53619816777SMark   if (1) { /* verify kernels */
537403adfb6SMatthew G Knepley   float *h_a, *h_b, *h_c;
538403adfb6SMatthew G Knepley   bool  errorSTREAMkernel = true;
539403adfb6SMatthew G Knepley 
540403adfb6SMatthew G Knepley   if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
541403adfb6SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
542403adfb6SMatthew G Knepley     exit(1);
543403adfb6SMatthew G Knepley   }
544403adfb6SMatthew G Knepley   if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
545403adfb6SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
546403adfb6SMatthew G Knepley     exit(1);
547403adfb6SMatthew G Knepley   }
548403adfb6SMatthew G Knepley 
549403adfb6SMatthew G Knepley   if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
550403adfb6SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
551403adfb6SMatthew G Knepley     exit(1);
552403adfb6SMatthew G Knepley   }
553403adfb6SMatthew G Knepley 
554403adfb6SMatthew G Knepley   /*
555403adfb6SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
556403adfb6SMatthew G Knepley    * device kernel output
557403adfb6SMatthew G Knepley    */
558403adfb6SMatthew G Knepley 
559403adfb6SMatthew G Knepley   /* Initialize memory on the device */
560403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
561403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
562403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
563403adfb6SMatthew G Knepley 
564403adfb6SMatthew G Knepley   STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
565403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
566403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
567403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
568403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
569403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
570403adfb6SMatthew G Knepley     exit(-2000);
571403adfb6SMatthew G Knepley   }
572403adfb6SMatthew G Knepley 
573403adfb6SMatthew G Knepley   /* Initialize memory on the device */
574403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
575403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
576403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
577403adfb6SMatthew G Knepley 
578403adfb6SMatthew G Knepley   STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
579403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
580403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
581403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
582403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
583403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
584403adfb6SMatthew G Knepley     exit(-3000);
585403adfb6SMatthew G Knepley   }
586403adfb6SMatthew G Knepley 
587403adfb6SMatthew G Knepley   /* Initialize memory on the device */
58819816777SMark   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
589403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
590403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
591403adfb6SMatthew G Knepley 
592403adfb6SMatthew G Knepley   STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
593403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
594403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
595403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
596403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
597403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
598403adfb6SMatthew G Knepley     exit(-4000);
599403adfb6SMatthew G Knepley   }
600403adfb6SMatthew G Knepley 
601403adfb6SMatthew G Knepley   /* Initialize memory on the device */
602403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
603403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
604403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
605403adfb6SMatthew G Knepley 
606403adfb6SMatthew G Knepley   STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
607403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
608403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
609403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
610403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
611403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
612403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
613403adfb6SMatthew G Knepley     exit(-5000);
614403adfb6SMatthew G Knepley   }
615403adfb6SMatthew G Knepley 
616403adfb6SMatthew G Knepley   /* Initialize memory on the device */
617403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
618403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
619403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
620403adfb6SMatthew G Knepley 
621403adfb6SMatthew G Knepley   STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
622403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
623403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
624403adfb6SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
625403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
626403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
627403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
628403adfb6SMatthew G Knepley     exit(-6000);
629403adfb6SMatthew G Knepley   }
630403adfb6SMatthew G Knepley 
63119816777SMark   free(h_a);
63219816777SMark   free(h_b);
63319816777SMark   free(h_c);
63419816777SMark   }
635403adfb6SMatthew G Knepley   /* continue from here */
63619816777SMark   printResultsReadable(times, sizeof(float));
637403adfb6SMatthew G Knepley 
638403adfb6SMatthew G Knepley   /* Free memory on device */
639403adfb6SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
640403adfb6SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
641403adfb6SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
64219816777SMark 
643403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
644403adfb6SMatthew G Knepley }
645403adfb6SMatthew G Knepley 
646caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
647caccb7e3SMatthew G Knepley {
648caccb7e3SMatthew G Knepley   double         *d_a, *d_b, *d_c;
649caccb7e3SMatthew G Knepley   int            k;
650caccb7e3SMatthew G Knepley   float          times[8][NTIMES];
651caccb7e3SMatthew G Knepley   double         scalar;
652caccb7e3SMatthew G Knepley   PetscErrorCode ierr;
653caccb7e3SMatthew G Knepley 
654caccb7e3SMatthew G Knepley   PetscFunctionBegin;
655caccb7e3SMatthew G Knepley   /* Allocate memory on device */
656caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
657caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
658caccb7e3SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);
659caccb7e3SMatthew G Knepley 
660caccb7e3SMatthew G Knepley   /* Compute execution configuration */
661caccb7e3SMatthew G Knepley 
662caccb7e3SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
663caccb7e3SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
664caccb7e3SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
665caccb7e3SMatthew G Knepley 
666caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
667caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
668caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
669caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
670caccb7e3SMatthew G Knepley 
671caccb7e3SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
672caccb7e3SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
673caccb7e3SMatthew G Knepley 
674caccb7e3SMatthew G Knepley   scalar=3.0;
675caccb7e3SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
6768563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
677caccb7e3SMatthew G Knepley     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
67819816777SMark     cudaStreamSynchronize(NULL);
679ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
680df3898eeSBarry Smith     //get the total elapsed time in ms
681caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
6828563dfccSBarry Smith       PetscTimeAdd(&cpuTimer);
68319816777SMark       times[0][k] = cpuTimer*1.e3;
684caccb7e3SMatthew G Knepley     }
685caccb7e3SMatthew G Knepley 
686caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6878563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
688caccb7e3SMatthew G Knepley     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
68919816777SMark     cudaStreamSynchronize(NULL);
690ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
691df3898eeSBarry Smith     //get the total elapsed time in ms
692caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
6938563dfccSBarry Smith       PetscTimeAdd(&cpuTimer);
69419816777SMark       times[1][k] = cpuTimer*1.e3;
695caccb7e3SMatthew G Knepley     }
696caccb7e3SMatthew G Knepley 
697caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6988563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
699caccb7e3SMatthew G Knepley     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
70019816777SMark     cudaStreamSynchronize(NULL);
701ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
702df3898eeSBarry Smith     //get the total elapsed time in ms
7038563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
70419816777SMark     if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;
705caccb7e3SMatthew G Knepley 
706caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7078563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
708caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
70919816777SMark     cudaStreamSynchronize(NULL);
710ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
711df3898eeSBarry Smith     //get the total elapsed time in ms
7128563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
71319816777SMark     if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;
714caccb7e3SMatthew G Knepley 
715caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7168563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
717caccb7e3SMatthew G Knepley     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
71819816777SMark     cudaStreamSynchronize(NULL);
719ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
720df3898eeSBarry Smith     //get the total elapsed time in ms
7218563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
72219816777SMark     if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
723caccb7e3SMatthew G Knepley 
724caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7258563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
726caccb7e3SMatthew G Knepley     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
72719816777SMark     cudaStreamSynchronize(NULL);
728ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
729df3898eeSBarry Smith     //get the total elapsed time in ms
7308563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
73119816777SMark     if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;
732caccb7e3SMatthew G Knepley 
733caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7348563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
735caccb7e3SMatthew G Knepley     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
73619816777SMark     cudaStreamSynchronize(NULL);
737ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
738df3898eeSBarry Smith     //get the total elapsed time in ms
7398563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
74019816777SMark     if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;
741caccb7e3SMatthew G Knepley 
742caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7438563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
744caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
74519816777SMark     cudaStreamSynchronize(NULL);
746ffc4695bSBarry Smith     ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
747df3898eeSBarry Smith     //get the total elapsed time in ms
7488563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
74919816777SMark     if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
750caccb7e3SMatthew G Knepley   }
751caccb7e3SMatthew G Knepley 
75219816777SMark   if (1) { /* verify kernels */
753caccb7e3SMatthew G Knepley   double *h_a, *h_b, *h_c;
754caccb7e3SMatthew G Knepley   bool   errorSTREAMkernel = true;
755caccb7e3SMatthew G Knepley 
756caccb7e3SMatthew G Knepley   if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
757caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
758caccb7e3SMatthew G Knepley     exit(1);
759caccb7e3SMatthew G Knepley   }
760caccb7e3SMatthew G Knepley   if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
761caccb7e3SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
762caccb7e3SMatthew G Knepley     exit(1);
763caccb7e3SMatthew G Knepley   }
764caccb7e3SMatthew G Knepley 
765caccb7e3SMatthew G Knepley   if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
766caccb7e3SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
767caccb7e3SMatthew G Knepley     exit(1);
768caccb7e3SMatthew G Knepley   }
769caccb7e3SMatthew G Knepley 
770caccb7e3SMatthew G Knepley   /*
771caccb7e3SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
772caccb7e3SMatthew G Knepley    * device kernel output
773caccb7e3SMatthew G Knepley    */
774caccb7e3SMatthew G Knepley 
775caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
776caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
777caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
778caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
779caccb7e3SMatthew G Knepley 
780caccb7e3SMatthew G Knepley   STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
781caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
782caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
783caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
784caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
785caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
786caccb7e3SMatthew G Knepley     exit(-2000);
787caccb7e3SMatthew G Knepley   }
788caccb7e3SMatthew G Knepley 
789caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
790caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
791caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
792caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
793caccb7e3SMatthew G Knepley 
794caccb7e3SMatthew G Knepley   STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
795caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
796caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
797caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
798caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
799caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
800caccb7e3SMatthew G Knepley     exit(-3000);
801caccb7e3SMatthew G Knepley   }
802caccb7e3SMatthew G Knepley 
803caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
804caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
805caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
806caccb7e3SMatthew G Knepley 
807caccb7e3SMatthew G Knepley   STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
808caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
809caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
810caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
811caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
812caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
813caccb7e3SMatthew G Knepley     exit(-4000);
814caccb7e3SMatthew G Knepley   }
815caccb7e3SMatthew G Knepley 
816caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
817caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
818caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
819caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
820caccb7e3SMatthew G Knepley 
821caccb7e3SMatthew G Knepley   STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
822caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
823caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
824caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
825caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
826caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
827caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
828caccb7e3SMatthew G Knepley     exit(-5000);
829caccb7e3SMatthew G Knepley   }
830caccb7e3SMatthew G Knepley 
831caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
832caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
833caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
834caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
835caccb7e3SMatthew G Knepley 
836caccb7e3SMatthew G Knepley   STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
837caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
838caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
839caccb7e3SMatthew G Knepley   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
840caccb7e3SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
841caccb7e3SMatthew G Knepley   if (errorSTREAMkernel) {
842caccb7e3SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
843caccb7e3SMatthew G Knepley     exit(-6000);
844caccb7e3SMatthew G Knepley   }
845caccb7e3SMatthew G Knepley 
84619816777SMark   free(h_a);
84719816777SMark   free(h_b);
84819816777SMark   free(h_c);
84919816777SMark   }
850caccb7e3SMatthew G Knepley   /* continue from here */
85119816777SMark   printResultsReadable(times,sizeof(double));
852caccb7e3SMatthew G Knepley 
853caccb7e3SMatthew G Knepley   /* Free memory on device */
854caccb7e3SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
855caccb7e3SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
856caccb7e3SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
85719816777SMark 
858caccb7e3SMatthew G Knepley   PetscFunctionReturn(0);
859caccb7e3SMatthew G Knepley }
860caccb7e3SMatthew G Knepley 
861403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
862403adfb6SMatthew G Knepley //Print Results to Screen and File
863403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
86419816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
865a6dfd86eSKarl Rupp {
866403adfb6SMatthew G Knepley   PetscErrorCode ierr;
867403adfb6SMatthew G Knepley   PetscInt       j, k;
868caccb7e3SMatthew G Knepley   float          avgtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
869caccb7e3SMatthew G Knepley   float          maxtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
870caccb7e3SMatthew G Knepley   float          mintime[8]          = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
87119816777SMark   // char           *label[8]           = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
87219816777SMark   const float    bytes_per_kernel[8] = {
87319816777SMark     2. * bsize * N,
87419816777SMark     2. * bsize * N,
87519816777SMark     2. * bsize * N,
87619816777SMark     2. * bsize * N,
87719816777SMark     3. * bsize * N,
87819816777SMark     3. * bsize * N,
87919816777SMark     3. * bsize * N,
88019816777SMark     3. * bsize * N
881403adfb6SMatthew G Knepley   };
88219816777SMark   double         rate,irate;
88319816777SMark   int            rank,size;
884403adfb6SMatthew G Knepley   PetscFunctionBegin;
885ffc4695bSBarry Smith   ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRMPI(ierr);
886ffc4695bSBarry Smith   ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRMPI(ierr);
887403adfb6SMatthew G Knepley   /* --- SUMMARY --- */
88819816777SMark   for (k = 0; k < NTIMES; ++k) {
889caccb7e3SMatthew G Knepley     for (j = 0; j < 8; ++j) {
89019816777SMark       avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
891403adfb6SMatthew G Knepley       mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
892403adfb6SMatthew G Knepley       maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
893403adfb6SMatthew G Knepley     }
89419816777SMark   }
895caccb7e3SMatthew G Knepley   for (j = 0; j < 8; ++j) {
896403adfb6SMatthew G Knepley     avgtime[j] = avgtime[j]/(float)(NTIMES-1);
897403adfb6SMatthew G Knepley   }
89819816777SMark   j = 7;
89919816777SMark   irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
90019816777SMark   ierr = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
901*dd400576SPatrick Sanan   if (rank == 0) {
90219816777SMark     FILE *fd;
90319816777SMark     if (size == 1) {
90419816777SMark       printf("%d %11.4f   Rate (MB/s)\n",size, rate);
90519816777SMark       fd = fopen("flops","w");
90619816777SMark       fprintf(fd,"%g\n",rate);
90719816777SMark       fclose(fd);
90819816777SMark     } else {
90919816777SMark       double prate;
91019816777SMark       fd = fopen("flops","r");
91119816777SMark       fscanf(fd,"%lg",&prate);
91219816777SMark       fclose(fd);
91319816777SMark       printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
91419816777SMark     }
91519816777SMark   }
91619816777SMark 
917403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
918403adfb6SMatthew G Knepley }
919