/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in single precision.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Single-Precision STREAM Benchmark implementation in CUDA\n"
"Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#define N       2000000
#define NTIMES  10

#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif

/* IEEE-754 single-precision machine epsilon; verification errors are
   expressed relative to it (roughly "units in the last place"). */
const float flt_eps = 1.192092896e-07f;

/* Fill a[0..len) with value. Grid-stride loop: correct for any launch
   configuration, even one with fewer threads than elements. */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* STREAM Copy kernel: b(i) = a(i), grid-stride loop. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* STREAM Copy kernel without the grid-stride loop overhead.
   Requires the thread index space to cover the whole vector; if the
   launch is too small the kernel refuses to run (partial copies would
   silently corrupt the verification). */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* STREAM Scale kernel: b(i) = scale*a(i), grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale*a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* STREAM Add kernel: c(i) = a(i) + b(i), grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* STREAM Triad kernel: c(i) = a(i) + scalar*b(i), grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Host side verification routines.
   Each returns true when some element differs from the host-computed
   expectation by more than the given tolerance in relative error,
   measured in multiples of flt_eps. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale*a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar*b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination; triad combines a
       multiply and an add, hence the looser 3-ULP tolerance */
    bDifferent = (relErrorULPS > 3.f);
  }
  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES]);

int main(int argc, char *argv[])
{
  PetscInt       device    = 0;      /* CUDA device number, -device option */
  PetscBool      cpuTiming = PETSC_FALSE; /* force host timers, -cputiming option */
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* Negative codes from setupStream mean "could not run at all"; only a
     completed run reports PASSES/FAILED. */
  ierr = setupStream(device, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int) deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", (int) deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* original code passed no argument for %d here: undefined behavior */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int) deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* Pick a block size matched to the multiprocessor width of the part. */
  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  /*cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));*/

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float          *d_a, *d_b, *d_c;       /* device vectors */
  int            k;
  float          times[5][NTIMES];       /* per-kernel, per-iteration times in ms */
  float          scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: enough blocks to cover N elements,
     rounding up when N is not a multiple of the block size */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;

  /* both timers report msec; PetscTime reports seconds, so the CPU path
     converts to ms below to keep times[][] in one unit */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* test 0: Copy */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(cpuTimer);
      times[0][k] = (float)(cpuTimer*1000.0); /* s -> ms */
    } else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    /* test 1: Copy Optimized */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(cpuTimer);
      times[1][k] = (float)(cpuTimer*1000.0); /* s -> ms */
    } else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    /* test 2: Scale */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(cpuTimer);
      times[2][k] = (float)(cpuTimer*1000.0); /* s -> ms */
    } else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    /* test 3: Add */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(cpuTimer);
      times[3][k] = (float)(cpuTimer*1000.0); /* s -> ms */
    } else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    /* test 4: Triad */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(cpuTimer);
      times[4][k] = (float)(cpuTimer*1000.0); /* s -> ms */
    } else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  float *h_a, *h_b, *h_c;
  bool  errorSTREAMkernel = true;

  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* note argument order: result vector is d_a here */
  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print timing summary */
  printResultsReadable(times);

  //clean up timers
  ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
  ierr = cudaEventDestroy(start);CHKERRQ(ierr);

  /* Free verification buffers on host (previously leaked) */
  free(h_a);
  free(h_b);
  free(h_c);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
PetscErrorCode printResultsReadable(float times[][NTIMES])
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[5] = {0., 0., 0., 0., 0.};
  float          maxtime[5] = {0., 0., 0., 0., 0.};
  float          mintime[5] = {1e30,1e30,1e30,1e30,1e30};
  const char     *label[5]  = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Add:       ", "Triad:     "};
  /* bytes moved per kernel invocation: copy/scale read+write one vector
     each, add/triad read two and write one */
  float          bytes_per_kernel[5] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- */
  for (k = 1; k < NTIMES; ++k) { /* note -- skip first iteration (warm-up) */
    for (j = 0; j < 5; ++j) {
      /* times[][] is in ms; convert to seconds for the report */
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function Rate (MB/s) Avg time Min time Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 5; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
    ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f %11.6f %12.6f %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}