xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 403adfb6d7fc5442a9d3fb849d3da717f6691bef)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in single precision.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
/* PETSc help string printed by -help; const because ISO C++ forbids binding a
   string literal to a mutable char* (PetscInitialize takes const char[]). */
static const char *help = "Single-Precision STREAM Benchmark implementation in CUDA\n"
                          "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24 
25 #include <petscconf.h>
26 #include <petscsys.h>
27 #include <petsctime.h>
28 
29 #define N	2000000
30 #define NTIMES	10
31 
32 # ifndef MIN
33 # define MIN(x,y) ((x)<(y)?(x):(y))
34 # endif
35 # ifndef MAX
36 # define MAX(x,y) ((x)>(y)?(x):(y))
37 # endif
38 
/* IEEE-754 single-precision machine epsilon (2^-23, i.e. FLT_EPSILON); the
   verify routines divide relative errors by this to express them in ulps. */
const float flt_eps = 1.192092896e-07f;
40 
/* Device kernel: fill every element of a[0..len) with value.
   Grid-stride loop, so any grid/block configuration covers the array. */
__global__ void set_array(float *a,  float value, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) {
    a[i] = value;
  }
}
49 
/* Device kernel, STREAM "Copy": b(i) = a(i).
   Grid-stride loop, so the launch configuration need not cover len exactly. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) {
    b[i] = a[i];
  }
}
58 
/* Device kernel, one-element-per-thread variant of STREAM "Copy".
   No stride loop: if the total thread count cannot cover the vector the
   kernel returns without copying anything, so the launch must supply at
   least len threads. */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;

  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < len) b[tid] = a[tid];
}
69 
/* Device kernel, STREAM "Scale": b(i) = scale * a(i). Grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) {
    b[i] = scale * a[i];
  }
}
78 
/* Device kernel, STREAM "Add": c(i) = a(i) + b(i). Grid-stride loop. */
__global__ void STREAM_Add( float *a, float *b, float *c,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) {
    c[i] = a[i] + b[i];
  }
}
87 
/* Device kernel, STREAM "Triad": c(i) = a(i) + scalar * b(i). Grid-stride loop. */
__global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) {
    c[i] = a[i] + scalar * b[i];
  }
}
96 
97 /* Host side verification routines */
/* Host-side check of the Copy kernel: b[] must equal a[] to within 2 ulps,
   element-wise.  Returns true (failure) as soon as one element differs.
   The zero cases are handled explicitly instead of relying on IEEE inf/NaN
   comparison semantics after a division by zero (fragile under fast-math). */
bool STREAM_Copy_verify(float *a, float *b, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx];
    float absDiff = fabsf(b[idx] - expectedResult);
    if (absDiff == 0.f) continue;          /* exact match, including 0 == 0 */
    if (expectedResult == 0.f) {           /* nonzero result vs zero expectation */
      bDifferent = true;
      continue;
    }
    /* element-wise relative error in units of single-precision ulps */
    float relErrorULPS = (absDiff/fabsf(expectedResult))/flt_eps;
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}
112 
/* Host-side check of the Scale kernel: b[] must equal scale*a[] to within
   2 ulps, element-wise.  Returns true (failure) on the first mismatch.
   Zero cases are handled explicitly to avoid dividing by zero below. */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = scale*a[idx];
    float absDiff = fabsf(b[idx] - expectedResult);
    if (absDiff == 0.f) continue;          /* exact match, including 0 == 0 */
    if (expectedResult == 0.f) {           /* nonzero result vs zero expectation */
      bDifferent = true;
      continue;
    }
    /* element-wise relative error in units of single-precision ulps */
    float relErrorULPS = (absDiff/fabsf(expectedResult))/flt_eps;
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}
127 
/* Host-side check of the Add kernel: c[] must equal a[]+b[] to within
   2 ulps, element-wise.  Returns true (failure) on the first mismatch.
   Zero cases are handled explicitly to avoid dividing by zero below. */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx] + b[idx];
    float absDiff = fabsf(c[idx] - expectedResult);
    if (absDiff == 0.f) continue;          /* exact match, including 0 == 0 */
    if (expectedResult == 0.f) {           /* nonzero result vs zero expectation */
      bDifferent = true;
      continue;
    }
    /* element-wise relative error in units of single-precision ulps */
    float relErrorULPS = (absDiff/fabsf(expectedResult))/flt_eps;
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}
142 
/* Host-side check of the Triad kernel: c[] must equal a[]+scalar*b[] to
   within 3 ulps (one extra ulp allowed since Triad performs two rounded
   operations).  Returns true (failure) on the first mismatch.  Zero cases
   are handled explicitly to avoid dividing by zero below. */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx] + scalar*b[idx];
    float absDiff = fabsf(c[idx] - expectedResult);
    if (absDiff == 0.f) continue;          /* exact match, including 0 == 0 */
    if (expectedResult == 0.f) {           /* nonzero result vs zero expectation */
      bDifferent = true;
      continue;
    }
    /* element-wise relative error in units of single-precision ulps */
    float relErrorULPS = (absDiff/fabsf(expectedResult))/flt_eps;
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}
157 
158 /* forward declarations */
159 PetscErrorCode setupStream(PetscInt device, PetscBool cpuTiming);
160 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
161 PetscErrorCode printResultsReadable(float times[][NTIMES]);
162 
/*
  Program entry point: initializes PETSc, reads the -device and -cputiming
  options, runs the benchmark via setupStream(), and reports PASSES/FAILED.
*/
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;           /* CUDA device index, overridden by -device */
  PetscBool      cpuTiming = PETSC_FALSE; /* PETSC_TRUE forces CPU wall-clock timing */
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  /* PetscOptionsBegin/End bracket the option queries; the indented calls
     register -device and -cputiming and read their values in one pass */
  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
    ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* setupStream deliberately returns sentinel codes (-1000: no device,
     -1: properties unavailable) rather than PETSc errors, so its result is
     inspected directly instead of being passed through CHKERRQ */
  ierr = setupStream(device, cpuTiming);
  if (ierr >= 0) {
    /* 0 means all kernels verified; any positive code is reported as FAILED */
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}
185 
186 ///////////////////////////////////////////////////////////////////////////////
187 //Run the appropriate tests
188 ///////////////////////////////////////////////////////////////////////////////
/*
  setupStream - Select and validate a CUDA device, pick a block size suited
  to its architecture, and run the benchmark.

  Input Parameters:
+ deviceNum - requested CUDA device index (falls back to device 0 if invalid)
- cpuTiming - PETSC_TRUE to time with the CPU clock instead of CUDA events

  Returns 0 on success; the sentinels -1000 (no CUDA device present) and
  -1 (device unusable) are interpreted by main().
*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount = 0; /* initialized: left untouched if the query fails */

    /* treat a failed query the same as "no devices" instead of reading an
       uninitialized deviceCount */
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit while %d expects int */
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  if (cudaSetDevice((int)deviceNum) != cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to select device %d, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", (int)deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* fixed: the original format string had %d but passed no argument */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* block size tuned per compute capability of the Fermi-era targets */
  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  /*cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));*/

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
236 
237 ///////////////////////////////////////////////////////////////////////////
238 // runStream
239 ///////////////////////////////////////////////////////////////////////////
/*
  runStream - Time and then verify the five STREAM kernels on the device.

  Input Parameters:
+ iNumThreadsPerBlock - threads per block used for every kernel launch
- bDontUseGPUTiming   - PETSC_TRUE to report CPU wall-clock times instead of
                        CUDA event times (both are collected in milliseconds)

  Each kernel is timed NTIMES times into times[kernel][iteration]; then each
  kernel is run once more and its output checked element-wise on the host.
  A verification failure or failed host allocation terminates the process
  (original benchmark behavior).  Finally a bandwidth summary is printed.
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;  /* device vectors, N floats each */
  int k;
  float times[5][NTIMES];  /* msec per kernel per iteration */
  float scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: enough blocks to cover all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1; /* round up for a partial final block */

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */

  scalar=3.0f;
  for(k = 0; k < NTIMES; ++k) {
    /* Bug fix: PetscTimeSubtract()/PetscTimeAdd() accumulate into cpuTimer,
       so it must be cleared at the top of every iteration; previously the
       Triad time of iteration k-1 leaked into the Copy time of iteration k
       when CPU timing was selected. */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time in ms */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time in ms */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time in ms */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time in ms */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time in ms */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
    }
  }

  /* verify kernels: host copies of the vectors for element-wise checks */
  float *h_a, *h_b, *h_c;
  bool errorSTREAMkernel = true;

  if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }

  if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    /* fixed typo in the original message ("Unalbe") */
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  /* blocking cudaMemcpy also synchronizes with the preceding launch */
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device (d_a is unused by Scale) */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* Triad writes its result into d_a: d_a = d_b + scalar*d_c */
  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print the bandwidth/timing summary table */
  ierr = printResultsReadable(times);CHKERRQ(ierr);

  /* clean up timers */
  ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
  ierr = cudaEventDestroy( start );CHKERRQ(ierr);

  /* Free host verification buffers (previously leaked) */
  free(h_a);
  free(h_b);
  free(h_c);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
466 
467 ///////////////////////////////////////////////////////////////////////////
468 //Print Results to Screen and File
469 ///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - Print a STREAM-style summary table.

  Input Parameter:
. times - times[kernel][iteration] in milliseconds for the five kernels
          (Copy, Copy Opt., Scale, Add, Triad)

  The first iteration of each kernel is discarded as warm-up; rates are
  computed from the minimum remaining time, following STREAM convention.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES]) {
  PetscErrorCode ierr;
  PetscInt       j, k;
  float	avgtime[5] = {0., 0., 0., 0., 0.};
  float	maxtime[5] = {0., 0., 0., 0., 0.};
  float	mintime[5] = {1e30,1e30,1e30,1e30,1e30};
  /* const: ISO C++ forbids binding string literals to mutable char* */
  const char *label[5] = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Add:       ", "Triad:     "};
  /* bytes moved per kernel invocation: 2 streams for the copies and Scale,
     3 streams for Add and Triad */
  float	bytes_per_kernel[5] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- accumulate avg/min/max in seconds (times are msec) --- */
  for(k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */
    for(j = 0; j < 5; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);

  for(j = 0; j < 5; ++j) {
     avgtime[j] = avgtime[j]/(float)(NTIMES-1); /* NTIMES-1 timed iterations */
     ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
503