1403adfb6SMatthew G Knepley /* 2403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA. 3403adfb6SMatthew G Knepley 4403adfb6SMatthew G Knepley COPY: a(i) = b(i) 5403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i) 6403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i) 7403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i) 8403adfb6SMatthew G Knepley 9403adfb6SMatthew G Knepley It measures the memory system on the device. 10403adfb6SMatthew G Knepley The implementation is in single precision. 11403adfb6SMatthew G Knepley 12403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin 13403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c 14403adfb6SMatthew G Knepley 15403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation 16403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010 17403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010 18403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011 19403adfb6SMatthew G Knepley 20403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example. 21403adfb6SMatthew G Knepley */ 22403adfb6SMatthew G Knepley static char *help = "Single-Precision STREAM Benchmark implementation in CUDA\n" 23403adfb6SMatthew G Knepley "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n"; 24403adfb6SMatthew G Knepley 25403adfb6SMatthew G Knepley #include <petscconf.h> 26403adfb6SMatthew G Knepley #include <petscsys.h> 27403adfb6SMatthew G Knepley #include <petsctime.h> 28403adfb6SMatthew G Knepley 29403adfb6SMatthew G Knepley #define N 2000000 30*caccb7e3SMatthew G Knepley #define N_DOUBLE 8000000 31403adfb6SMatthew G Knepley #define NTIMES 10 32403adfb6SMatthew G Knepley 33403adfb6SMatthew G Knepley # ifndef MIN 34403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y)?(x):(y)) 35403adfb6SMatthew G Knepley # endif 36403adfb6SMatthew G Knepley # ifndef MAX 37403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y)?(x):(y)) 38403adfb6SMatthew G Knepley # endif 39403adfb6SMatthew G Knepley 40403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f; 41*caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16; 42403adfb6SMatthew G Knepley 43403adfb6SMatthew G Knepley __global__ void set_array(float *a, float value, size_t len) 44403adfb6SMatthew G Knepley { 45403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 46403adfb6SMatthew G Knepley while (idx < len) { 47403adfb6SMatthew G Knepley a[idx] = value; 48403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 49403adfb6SMatthew G Knepley } 50403adfb6SMatthew G Knepley } 51403adfb6SMatthew G Knepley 52*caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a, double value, size_t len) 53*caccb7e3SMatthew G Knepley { 54*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 55*caccb7e3SMatthew G Knepley while (idx < len) { 56*caccb7e3SMatthew G Knepley a[idx] = value; 57*caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 58*caccb7e3SMatthew G Knepley } 59*caccb7e3SMatthew G Knepley } 60*caccb7e3SMatthew G Knepley 61403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len) 62403adfb6SMatthew G Knepley { 63403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 64403adfb6SMatthew G Knepley while (idx < len) { 65403adfb6SMatthew G Knepley b[idx] = a[idx]; 66403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 67403adfb6SMatthew G Knepley } 68403adfb6SMatthew G Knepley } 69403adfb6SMatthew G Knepley 70*caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len) 71*caccb7e3SMatthew G Knepley { 72*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 73*caccb7e3SMatthew G Knepley while (idx < len) { 74*caccb7e3SMatthew G Knepley b[idx] = a[idx]; 75*caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 76*caccb7e3SMatthew G Knepley } 77*caccb7e3SMatthew G Knepley } 78*caccb7e3SMatthew G Knepley 79403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) 80403adfb6SMatthew G Knepley { 81403adfb6SMatthew G Knepley /* 82403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 83403adfb6SMatthew G Knepley * vector index space else return. 84403adfb6SMatthew G Knepley */ 85403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 86403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 87403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 88403adfb6SMatthew G Knepley } 89403adfb6SMatthew G Knepley 90*caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) 91*caccb7e3SMatthew G Knepley { 92*caccb7e3SMatthew G Knepley /* 93*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 94*caccb7e3SMatthew G Knepley * vector index space else return. 95*caccb7e3SMatthew G Knepley */ 96*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 97*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 98*caccb7e3SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 99*caccb7e3SMatthew G Knepley } 100*caccb7e3SMatthew G Knepley 101403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) 102403adfb6SMatthew G Knepley { 103403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 104403adfb6SMatthew G Knepley while (idx < len) { 105403adfb6SMatthew G Knepley b[idx] = scale* a[idx]; 106403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 107403adfb6SMatthew G Knepley } 108403adfb6SMatthew G Knepley } 109403adfb6SMatthew G Knepley 110*caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) 111*caccb7e3SMatthew G Knepley { 112*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 113*caccb7e3SMatthew G Knepley while (idx < len) { 114*caccb7e3SMatthew G Knepley b[idx] = scale* a[idx]; 115*caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 116*caccb7e3SMatthew G Knepley } 117*caccb7e3SMatthew G Knepley } 118*caccb7e3SMatthew G Knepley 119*caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) 120*caccb7e3SMatthew G Knepley { 121*caccb7e3SMatthew G Knepley /* 122*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 123*caccb7e3SMatthew G Knepley * vector index space else return. 124*caccb7e3SMatthew G Knepley */ 125*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 126*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 127*caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 128*caccb7e3SMatthew G Knepley } 129*caccb7e3SMatthew G Knepley 130*caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) 131*caccb7e3SMatthew G Knepley { 132*caccb7e3SMatthew G Knepley /* 133*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 134*caccb7e3SMatthew G Knepley * vector index space else return. 135*caccb7e3SMatthew G Knepley */ 136*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 137*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 138*caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 139*caccb7e3SMatthew G Knepley } 140*caccb7e3SMatthew G Knepley 141403adfb6SMatthew G Knepley __global__ void STREAM_Add( float *a, float *b, float *c, size_t len) 142403adfb6SMatthew G Knepley { 143403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 144403adfb6SMatthew G Knepley while (idx < len) { 145403adfb6SMatthew G Knepley c[idx] = a[idx]+b[idx]; 146403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 147403adfb6SMatthew G Knepley } 148403adfb6SMatthew G Knepley } 149403adfb6SMatthew G Knepley 150*caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double( double *a, double *b, double *c, size_t len) 151*caccb7e3SMatthew G Knepley { 152*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 153*caccb7e3SMatthew G Knepley while (idx < len) { 154*caccb7e3SMatthew G Knepley c[idx] = a[idx]+b[idx]; 155*caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 156*caccb7e3SMatthew G Knepley } 157*caccb7e3SMatthew G Knepley } 158*caccb7e3SMatthew G Knepley 159*caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized( float *a, float *b, float *c, size_t len) 160*caccb7e3SMatthew G Knepley { 161*caccb7e3SMatthew G Knepley /* 162*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 163*caccb7e3SMatthew G Knepley * vector index space else return. 164*caccb7e3SMatthew G Knepley */ 165*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 166*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 167*caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 168*caccb7e3SMatthew G Knepley } 169*caccb7e3SMatthew G Knepley 170*caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double( double *a, double *b, double *c, size_t len) 171*caccb7e3SMatthew G Knepley { 172*caccb7e3SMatthew G Knepley /* 173*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 174*caccb7e3SMatthew G Knepley * vector index space else return. 175*caccb7e3SMatthew G Knepley */ 176*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 177*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 178*caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 179*caccb7e3SMatthew G Knepley } 180*caccb7e3SMatthew G Knepley 181403adfb6SMatthew G Knepley __global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len) 182403adfb6SMatthew G Knepley { 183403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 184403adfb6SMatthew G Knepley while (idx < len) { 185403adfb6SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 186403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 187403adfb6SMatthew G Knepley } 188403adfb6SMatthew G Knepley } 189403adfb6SMatthew G Knepley 190*caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double( double *a, double *b, double *c, double scalar, size_t len) 191*caccb7e3SMatthew G Knepley { 192*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 193*caccb7e3SMatthew G Knepley while (idx < len) { 194*caccb7e3SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 195*caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 196*caccb7e3SMatthew G Knepley } 197*caccb7e3SMatthew G Knepley } 198*caccb7e3SMatthew G Knepley 199*caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized( float *a, float *b, float *c, float scalar, size_t len) 200*caccb7e3SMatthew G Knepley { 201*caccb7e3SMatthew G Knepley /* 202*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 203*caccb7e3SMatthew G Knepley * vector index space else return. 204*caccb7e3SMatthew G Knepley */ 205*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 206*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 207*caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 208*caccb7e3SMatthew G Knepley } 209*caccb7e3SMatthew G Knepley 210*caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double( double *a, double *b, double *c, double scalar, size_t len) 211*caccb7e3SMatthew G Knepley { 212*caccb7e3SMatthew G Knepley /* 213*caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 214*caccb7e3SMatthew G Knepley * vector index space else return. 215*caccb7e3SMatthew G Knepley */ 216*caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 217*caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 218*caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 219*caccb7e3SMatthew G Knepley } 220*caccb7e3SMatthew G Knepley 221403adfb6SMatthew G Knepley /* Host side verification routines */ 222403adfb6SMatthew G Knepley bool STREAM_Copy_verify(float *a, float *b, size_t len) { 223403adfb6SMatthew G Knepley size_t idx; 224403adfb6SMatthew G Knepley bool bDifferent = false; 225403adfb6SMatthew G Knepley 226403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 227403adfb6SMatthew G Knepley float expectedResult = a[idx]; 228403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 229403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 230403adfb6SMatthew G Knepley /* element-wise relative error determination */ 231403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 232403adfb6SMatthew G Knepley } 233403adfb6SMatthew G Knepley 234403adfb6SMatthew G Knepley return bDifferent; 235403adfb6SMatthew G Knepley } 236403adfb6SMatthew G Knepley 237*caccb7e3SMatthew G Knepley bool STREAM_Copy_verify_double(double *a, double *b, size_t len) { 238*caccb7e3SMatthew G Knepley size_t idx; 239*caccb7e3SMatthew G Knepley bool bDifferent = false; 240*caccb7e3SMatthew G Knepley 241*caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 242*caccb7e3SMatthew G Knepley double expectedResult = a[idx]; 243*caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 244*caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 245*caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 246*caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 247*caccb7e3SMatthew G Knepley } 248*caccb7e3SMatthew G Knepley 249*caccb7e3SMatthew G Knepley return bDifferent; 250*caccb7e3SMatthew G Knepley } 251*caccb7e3SMatthew G Knepley 252403adfb6SMatthew G Knepley bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) { 253403adfb6SMatthew G Knepley size_t idx; 254403adfb6SMatthew G Knepley bool bDifferent = false; 255403adfb6SMatthew G Knepley 256403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 257403adfb6SMatthew G Knepley float expectedResult = scale*a[idx]; 258403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 259403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 260403adfb6SMatthew G Knepley /* element-wise relative error determination */ 261403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 262403adfb6SMatthew G Knepley } 263403adfb6SMatthew G Knepley 264403adfb6SMatthew G Knepley return bDifferent; 265403adfb6SMatthew G Knepley } 266403adfb6SMatthew G Knepley 267*caccb7e3SMatthew G Knepley bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) { 268*caccb7e3SMatthew G Knepley size_t idx; 269*caccb7e3SMatthew G Knepley bool bDifferent = false; 270*caccb7e3SMatthew G Knepley 271*caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 272*caccb7e3SMatthew G Knepley double expectedResult = scale*a[idx]; 273*caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 274*caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 275*caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 276*caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 277*caccb7e3SMatthew G Knepley } 278*caccb7e3SMatthew G Knepley 279*caccb7e3SMatthew G Knepley return bDifferent; 280*caccb7e3SMatthew G Knepley } 281*caccb7e3SMatthew G Knepley 282403adfb6SMatthew G Knepley bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) { 283403adfb6SMatthew G Knepley size_t idx; 284403adfb6SMatthew G Knepley bool bDifferent = false; 285403adfb6SMatthew G Knepley 286403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 287403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx]; 288403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 289403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 290403adfb6SMatthew G Knepley /* element-wise relative error determination */ 291403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 292403adfb6SMatthew G Knepley } 293403adfb6SMatthew G Knepley 294403adfb6SMatthew G Knepley return bDifferent; 295403adfb6SMatthew G Knepley } 296403adfb6SMatthew G Knepley 297*caccb7e3SMatthew G Knepley bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) { 298*caccb7e3SMatthew G Knepley size_t idx; 299*caccb7e3SMatthew G Knepley bool bDifferent = false; 300*caccb7e3SMatthew G Knepley 301*caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 302*caccb7e3SMatthew G Knepley double expectedResult = a[idx] + b[idx]; 303*caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 304*caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 305*caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 306*caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 307*caccb7e3SMatthew G Knepley } 308*caccb7e3SMatthew G Knepley 309*caccb7e3SMatthew G Knepley return bDifferent; 310*caccb7e3SMatthew G Knepley } 311*caccb7e3SMatthew G Knepley 312403adfb6SMatthew G Knepley bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) { 313403adfb6SMatthew G Knepley size_t idx; 314403adfb6SMatthew G Knepley bool bDifferent = false; 315403adfb6SMatthew G Knepley 316403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 317403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar*b[idx]; 318403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 319403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 320403adfb6SMatthew G Knepley /* element-wise relative error determination */ 321403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f); 322403adfb6SMatthew G Knepley } 323403adfb6SMatthew G Knepley 324403adfb6SMatthew G Knepley return bDifferent; 325403adfb6SMatthew G Knepley } 326403adfb6SMatthew G Knepley 327*caccb7e3SMatthew G Knepley bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) { 328*caccb7e3SMatthew G Knepley size_t idx; 329*caccb7e3SMatthew G Knepley bool bDifferent = false; 330*caccb7e3SMatthew G Knepley 331*caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 332*caccb7e3SMatthew G Knepley double expectedResult = a[idx] + scalar*b[idx]; 333*caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 334*caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 335*caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 336*caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 3.); 337*caccb7e3SMatthew G Knepley } 338*caccb7e3SMatthew G Knepley 339*caccb7e3SMatthew G Knepley return bDifferent; 340*caccb7e3SMatthew G Knepley } 341*caccb7e3SMatthew G Knepley 342403adfb6SMatthew G Knepley /* forward declarations */ 343*caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming); 344403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 345*caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 346403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]); 347403adfb6SMatthew G Knepley 348403adfb6SMatthew G Knepley int main(int argc, char *argv[]) 349403adfb6SMatthew G Knepley { 350403adfb6SMatthew G Knepley PetscInt device = 0; 351*caccb7e3SMatthew G Knepley PetscBool runDouble = PETSC_FALSE; 352403adfb6SMatthew G Knepley PetscBool cpuTiming = PETSC_FALSE; 353403adfb6SMatthew G Knepley PetscErrorCode ierr; 354403adfb6SMatthew G Knepley 355403adfb6SMatthew G Knepley ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr); 356*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr); 357403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr); 358403adfb6SMatthew G Knepley 359403adfb6SMatthew G Knepley ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr); 360403adfb6SMatthew G Knepley ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr); 361*caccb7e3SMatthew G Knepley ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr); 362403adfb6SMatthew G Knepley ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr); 363403adfb6SMatthew G Knepley ierr = PetscOptionsEnd(); 364403adfb6SMatthew G Knepley 365*caccb7e3SMatthew G Knepley ierr = setupStream(device, runDouble, cpuTiming); 366403adfb6SMatthew G Knepley if (ierr >= 0) { 367403adfb6SMatthew G Knepley PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2); 368403adfb6SMatthew G Knepley } 369403adfb6SMatthew G Knepley PetscFinalize(); 370403adfb6SMatthew G Knepley return 0; 371403adfb6SMatthew G Knepley } 372403adfb6SMatthew G Knepley 373403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 374403adfb6SMatthew G Knepley //Run the appropriate tests 375403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 376*caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) 377403adfb6SMatthew G Knepley { 378403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128; 379403adfb6SMatthew G Knepley PetscErrorCode ierr; 380403adfb6SMatthew G Knepley 381403adfb6SMatthew G Knepley PetscFunctionBegin; 382403adfb6SMatthew G Knepley // Check device 383403adfb6SMatthew G Knepley { 384403adfb6SMatthew G Knepley int deviceCount; 385403adfb6SMatthew G Knepley 386403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount); 387403adfb6SMatthew G Knepley if (deviceCount == 0) { 388403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr); 389403adfb6SMatthew G Knepley return -1000; 390403adfb6SMatthew G Knepley } 391403adfb6SMatthew G Knepley 392403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) { 393403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr); 394403adfb6SMatthew G Knepley deviceNum = 0; 395403adfb6SMatthew G Knepley } 396403adfb6SMatthew G Knepley } 397403adfb6SMatthew G Knepley 398403adfb6SMatthew G Knepley cudaSetDevice(deviceNum); 399403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr); 400403adfb6SMatthew G Knepley cudaDeviceProp deviceProp; 401403adfb6SMatthew G Knepley if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) { 402403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr); 403403adfb6SMatthew G Knepley } else { 404403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr); 405403adfb6SMatthew G Knepley return -1; 406403adfb6SMatthew G Knepley } 407403adfb6SMatthew G Knepley 408*caccb7e3SMatthew G Knepley if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) { 409*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr); 410*caccb7e3SMatthew G Knepley return -1; 411*caccb7e3SMatthew G Knepley } 412403adfb6SMatthew G Knepley if (deviceProp.major == 2 && deviceProp.minor == 1) { 413403adfb6SMatthew G Knepley iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */ 414403adfb6SMatthew G Knepley } else { 415403adfb6SMatthew G Knepley iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */ 416403adfb6SMatthew G Knepley } 417403adfb6SMatthew G Knepley 418403adfb6SMatthew G Knepley if (cpuTiming) { 419403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr); 420403adfb6SMatthew G Knepley } 421403adfb6SMatthew G Knepley 422403adfb6SMatthew G Knepley ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 423*caccb7e3SMatthew G Knepley if (runDouble) { 424*caccb7e3SMatthew G Knepley ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr); 425*caccb7e3SMatthew G Knepley ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 426*caccb7e3SMatthew G Knepley } 427403adfb6SMatthew G Knepley PetscFunctionReturn(0); 428403adfb6SMatthew G Knepley } 429403adfb6SMatthew G Knepley 430403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 431403adfb6SMatthew G Knepley // runStream 432403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 433403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 434403adfb6SMatthew G Knepley { 435403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c; 436403adfb6SMatthew G Knepley int k; 437*caccb7e3SMatthew G Knepley float times[8][NTIMES]; 438403adfb6SMatthew G Knepley float scalar; 439403adfb6SMatthew G Knepley PetscErrorCode ierr; 440403adfb6SMatthew G Knepley 441403adfb6SMatthew G Knepley PetscFunctionBegin; 442403adfb6SMatthew G Knepley /* Allocate memory on device */ 443403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr); 444403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr); 445403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr); 446403adfb6SMatthew G Knepley 447403adfb6SMatthew G Knepley /* Compute execution configuration */ 448403adfb6SMatthew G Knepley 449403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 450403adfb6SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 451403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 452403adfb6SMatthew G Knepley 453403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr); 454403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr); 455403adfb6SMatthew G Knepley 456403adfb6SMatthew G Knepley /* Initialize memory on the device */ 457403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 458403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 459403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 460403adfb6SMatthew G Knepley 461403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 462403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 463403adfb6SMatthew G Knepley cudaEvent_t start, stop; 464403adfb6SMatthew G Knepley 465403adfb6SMatthew G Knepley /* both timers report msec */ 466403adfb6SMatthew G Knepley ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */ 467403adfb6SMatthew G Knepley ierr = cudaEventCreate( &stop );CHKERRQ(ierr); /* gpu timer facility */ 468403adfb6SMatthew G Knepley 469403adfb6SMatthew G Knepley scalar=3.0f; 470403adfb6SMatthew G Knepley for(k = 0; k < NTIMES; ++k) { 471403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 472403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 473403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 474403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 475403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 476403adfb6SMatthew G Knepley //get the the total elapsed time in ms 477403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 478*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 479403adfb6SMatthew G Knepley times[0][k] = cpuTimer; 480403adfb6SMatthew G Knepley } else { 481403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[0][k], start, stop );CHKERRQ(ierr); 482403adfb6SMatthew G Knepley } 483403adfb6SMatthew G Knepley 484403adfb6SMatthew G Knepley cpuTimer = 0.0; 485403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 486403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 487403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 488403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 489403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 490403adfb6SMatthew G Knepley //get the the total elapsed time in ms 491403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 492*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 493403adfb6SMatthew G Knepley times[1][k] = cpuTimer; 494403adfb6SMatthew G Knepley } else { 495403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[1][k], start, stop );CHKERRQ(ierr); 496403adfb6SMatthew G Knepley } 497403adfb6SMatthew G Knepley 498403adfb6SMatthew G Knepley cpuTimer = 0.0; 499403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 500403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 501403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 502403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 503403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 504403adfb6SMatthew G Knepley //get the the total elapsed time in ms 505403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 506403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 507403adfb6SMatthew G Knepley times[2][k] = cpuTimer; 508403adfb6SMatthew G Knepley } else { 509403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 510403adfb6SMatthew G Knepley } 511403adfb6SMatthew G Knepley 512403adfb6SMatthew G Knepley cpuTimer = 0.0; 513403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 514403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 515*caccb7e3SMatthew G Knepley STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 516403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 517403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 518403adfb6SMatthew G Knepley //get the the total elapsed time in ms 519403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 520403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 521403adfb6SMatthew G Knepley times[3][k] = cpuTimer; 522403adfb6SMatthew G Knepley } else { 523403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 524403adfb6SMatthew G Knepley } 525403adfb6SMatthew G Knepley 526403adfb6SMatthew G Knepley cpuTimer = 0.0; 527403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 528403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 529*caccb7e3SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 530403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 531403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 532403adfb6SMatthew G Knepley //get the the total elapsed time in ms 533403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 534403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 535403adfb6SMatthew G Knepley times[4][k] = cpuTimer; 536403adfb6SMatthew G Knepley } else { 537403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 538403adfb6SMatthew G Knepley } 539403adfb6SMatthew G Knepley 540*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 541*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 542*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 543*caccb7e3SMatthew G Knepley STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 544*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 545*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 546*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 547*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 548*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 549*caccb7e3SMatthew G Knepley times[5][k] = cpuTimer; 550*caccb7e3SMatthew G Knepley } else { 551*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[5][k], start, stop );CHKERRQ(ierr); 552*caccb7e3SMatthew G Knepley } 553*caccb7e3SMatthew G Knepley 554*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 555*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 556*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 557*caccb7e3SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 558*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 559*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 560*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 561*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 562*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 563*caccb7e3SMatthew G Knepley times[6][k] = cpuTimer; 564*caccb7e3SMatthew G Knepley } else { 565*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[6][k], start, stop );CHKERRQ(ierr); 566*caccb7e3SMatthew G Knepley } 567*caccb7e3SMatthew G Knepley 568*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 569*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 570*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 571*caccb7e3SMatthew G Knepley STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 572*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 573*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 574*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 575*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 576*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 577*caccb7e3SMatthew G Knepley times[7][k] = cpuTimer; 578*caccb7e3SMatthew G Knepley } else { 579*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[7][k], start, stop );CHKERRQ(ierr); 580*caccb7e3SMatthew G Knepley } 581*caccb7e3SMatthew G Knepley 582403adfb6SMatthew G Knepley } 583403adfb6SMatthew G Knepley 584403adfb6SMatthew G Knepley /* verify kernels */ 585403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c; 586403adfb6SMatthew G Knepley bool errorSTREAMkernel = true; 587403adfb6SMatthew G Knepley 588403adfb6SMatthew G Knepley if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 589403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 590403adfb6SMatthew G Knepley exit(1); 591403adfb6SMatthew G Knepley } 592403adfb6SMatthew G Knepley if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 593403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 594403adfb6SMatthew G Knepley exit(1); 595403adfb6SMatthew G Knepley } 596403adfb6SMatthew G Knepley 597403adfb6SMatthew G Knepley if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 598403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 599403adfb6SMatthew G Knepley exit(1); 600403adfb6SMatthew G Knepley } 601403adfb6SMatthew G Knepley 602403adfb6SMatthew G Knepley /* 603403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 604403adfb6SMatthew G Knepley * device kernel output 605403adfb6SMatthew G Knepley */ 606403adfb6SMatthew G Knepley 607403adfb6SMatthew G Knepley /* Initialize memory on the device */ 608403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 609403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 610403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 611403adfb6SMatthew G Knepley 612403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 613403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 614403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 615403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 616403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 617403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 618403adfb6SMatthew G Knepley exit(-2000); 619403adfb6SMatthew G Knepley } else { 620403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr); 621403adfb6SMatthew G Knepley } 622403adfb6SMatthew G Knepley 623403adfb6SMatthew G Knepley /* Initialize memory on the device */ 624403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 625403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 626403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 627403adfb6SMatthew G Knepley 628403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 629403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 630403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 631403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 632403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 633403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 634403adfb6SMatthew G Knepley exit(-3000); 635403adfb6SMatthew G Knepley } else { 636403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr); 637403adfb6SMatthew G Knepley } 638403adfb6SMatthew G Knepley 639403adfb6SMatthew G Knepley /* Initialize memory on the device */ 640403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 641403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 642403adfb6SMatthew G Knepley 643403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 644403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 645403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 646403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N); 647403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 648403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 649403adfb6SMatthew G Knepley exit(-4000); 650403adfb6SMatthew G Knepley } else { 651403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr); 652403adfb6SMatthew G Knepley } 653403adfb6SMatthew G Knepley 654403adfb6SMatthew G Knepley /* Initialize memory on the device */ 655403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 656403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 657403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 658403adfb6SMatthew G Knepley 659403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 660403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 661403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 662403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 663403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N); 664403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 665403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 666403adfb6SMatthew G Knepley exit(-5000); 667403adfb6SMatthew G Knepley } else { 668403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr); 669403adfb6SMatthew G Knepley } 670403adfb6SMatthew G Knepley 671403adfb6SMatthew G Knepley /* Initialize memory on the device */ 672403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 673403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 674403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 675403adfb6SMatthew G Knepley 676403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 677403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 678403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 679403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 680403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N); 681403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 682403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 683403adfb6SMatthew G Knepley exit(-6000); 684403adfb6SMatthew G Knepley } else { 685403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr); 686403adfb6SMatthew G Knepley } 687403adfb6SMatthew G Knepley 688403adfb6SMatthew G Knepley /* continue from here */ 689403adfb6SMatthew G Knepley printResultsReadable(times); 690403adfb6SMatthew G Knepley 691403adfb6SMatthew G Knepley //clean up timers 692403adfb6SMatthew G Knepley ierr = cudaEventDestroy( stop );CHKERRQ(ierr); 693403adfb6SMatthew G Knepley ierr = cudaEventDestroy( start );CHKERRQ(ierr); 694403adfb6SMatthew G Knepley 695403adfb6SMatthew G Knepley /* Free memory on device */ 696403adfb6SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 697403adfb6SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 698403adfb6SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 699403adfb6SMatthew G Knepley PetscFunctionReturn(0); 700403adfb6SMatthew G Knepley } 701403adfb6SMatthew G Knepley 702*caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 703*caccb7e3SMatthew G Knepley { 704*caccb7e3SMatthew G Knepley double *d_a, *d_b, *d_c; 705*caccb7e3SMatthew G Knepley int k; 706*caccb7e3SMatthew G Knepley float times[8][NTIMES]; 707*caccb7e3SMatthew G Knepley double scalar; 708*caccb7e3SMatthew G Knepley PetscErrorCode ierr; 709*caccb7e3SMatthew G Knepley 710*caccb7e3SMatthew G Knepley PetscFunctionBegin; 711*caccb7e3SMatthew G Knepley /* Allocate memory on device */ 712*caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr); 713*caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr); 714*caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr); 715*caccb7e3SMatthew G Knepley 716*caccb7e3SMatthew G Knepley /* Compute execution configuration */ 717*caccb7e3SMatthew G Knepley 718*caccb7e3SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 719*caccb7e3SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 720*caccb7e3SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 721*caccb7e3SMatthew G Knepley 722*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr); 723*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr); 724*caccb7e3SMatthew G Knepley 725*caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 726*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 727*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 728*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 729*caccb7e3SMatthew G Knepley 730*caccb7e3SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 731*caccb7e3SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 732*caccb7e3SMatthew G Knepley cudaEvent_t start, stop; 733*caccb7e3SMatthew G Knepley 734*caccb7e3SMatthew G Knepley /* both timers report msec */ 735*caccb7e3SMatthew G Knepley ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */ 736*caccb7e3SMatthew G Knepley ierr = cudaEventCreate( &stop );CHKERRQ(ierr); /* gpu timer facility */ 737*caccb7e3SMatthew G Knepley 738*caccb7e3SMatthew G Knepley scalar=3.0; 739*caccb7e3SMatthew G Knepley for(k = 0; k < NTIMES; ++k) { 740*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 741*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 742*caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 743*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 744*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 745*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 746*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 747*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 748*caccb7e3SMatthew G Knepley times[0][k] = cpuTimer; 749*caccb7e3SMatthew G Knepley } else { 750*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[0][k], start, stop );CHKERRQ(ierr); 751*caccb7e3SMatthew G Knepley } 752*caccb7e3SMatthew G Knepley 753*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 754*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 755*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 756*caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 757*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 758*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 759*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 760*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 761*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 762*caccb7e3SMatthew G Knepley times[1][k] = cpuTimer; 763*caccb7e3SMatthew G Knepley } else { 764*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[1][k], start, stop );CHKERRQ(ierr); 765*caccb7e3SMatthew G Knepley } 766*caccb7e3SMatthew G Knepley 767*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 768*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 769*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 770*caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 771*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 772*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 773*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 774*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 775*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 776*caccb7e3SMatthew G Knepley times[2][k] = cpuTimer; 777*caccb7e3SMatthew G Knepley } else { 778*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 779*caccb7e3SMatthew G Knepley } 780*caccb7e3SMatthew G Knepley 781*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 782*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 783*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 784*caccb7e3SMatthew G Knepley STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 785*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 786*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 787*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 788*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 789*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 790*caccb7e3SMatthew G Knepley times[3][k] = cpuTimer; 791*caccb7e3SMatthew G Knepley } else { 792*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 793*caccb7e3SMatthew G Knepley } 794*caccb7e3SMatthew G Knepley 795*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 796*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 797*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 798*caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 799*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 800*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 801*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 802*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 803*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 804*caccb7e3SMatthew G Knepley times[4][k] = cpuTimer; 805*caccb7e3SMatthew G Knepley } else { 806*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 807*caccb7e3SMatthew G Knepley } 808*caccb7e3SMatthew G Knepley 809*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 810*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 811*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 812*caccb7e3SMatthew G Knepley STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 813*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 814*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 815*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 816*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 817*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 818*caccb7e3SMatthew G Knepley times[5][k] = cpuTimer; 819*caccb7e3SMatthew G Knepley } else { 820*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 821*caccb7e3SMatthew G Knepley } 822*caccb7e3SMatthew G Knepley 823*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 824*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 825*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 826*caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 827*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 828*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 829*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 830*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 831*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 832*caccb7e3SMatthew G Knepley times[6][k] = cpuTimer; 833*caccb7e3SMatthew G Knepley } else { 834*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 835*caccb7e3SMatthew G Knepley } 836*caccb7e3SMatthew G Knepley 837*caccb7e3SMatthew G Knepley cpuTimer = 0.0; 838*caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 839*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 840*caccb7e3SMatthew G Knepley STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 841*caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 842*caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 843*caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 844*caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 845*caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 846*caccb7e3SMatthew G Knepley times[7][k] = cpuTimer; 847*caccb7e3SMatthew G Knepley } else { 848*caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 849*caccb7e3SMatthew G Knepley } 850*caccb7e3SMatthew G Knepley 851*caccb7e3SMatthew G Knepley } 852*caccb7e3SMatthew G Knepley 853*caccb7e3SMatthew G Knepley /* verify kernels */ 854*caccb7e3SMatthew G Knepley double *h_a, *h_b, *h_c; 855*caccb7e3SMatthew G Knepley bool errorSTREAMkernel = true; 856*caccb7e3SMatthew G Knepley 857*caccb7e3SMatthew G Knepley if ( (h_a = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) { 858*caccb7e3SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 859*caccb7e3SMatthew G Knepley exit(1); 860*caccb7e3SMatthew G Knepley } 861*caccb7e3SMatthew G Knepley if ( (h_b = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) { 862*caccb7e3SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 863*caccb7e3SMatthew G Knepley exit(1); 864*caccb7e3SMatthew G Knepley } 865*caccb7e3SMatthew G Knepley 866*caccb7e3SMatthew G Knepley if ( (h_c = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) { 867*caccb7e3SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 868*caccb7e3SMatthew G Knepley exit(1); 869*caccb7e3SMatthew G Knepley } 870*caccb7e3SMatthew G Knepley 871*caccb7e3SMatthew G Knepley /* 872*caccb7e3SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 873*caccb7e3SMatthew G Knepley * device kernel output 874*caccb7e3SMatthew G Knepley */ 875*caccb7e3SMatthew G Knepley 876*caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 877*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 878*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 879*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 880*caccb7e3SMatthew G Knepley 881*caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 882*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 883*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 884*caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 885*caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 886*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 887*caccb7e3SMatthew G Knepley exit(-2000); 888*caccb7e3SMatthew G Knepley } else { 889*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr); 890*caccb7e3SMatthew G Knepley } 891*caccb7e3SMatthew G Knepley 892*caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 893*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 894*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 895*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 896*caccb7e3SMatthew G Knepley 897*caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 898*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 899*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 900*caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 901*caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 902*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 903*caccb7e3SMatthew G Knepley exit(-3000); 904*caccb7e3SMatthew G Knepley } else { 905*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr); 906*caccb7e3SMatthew G Knepley } 907*caccb7e3SMatthew G Knepley 908*caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 909*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 910*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 911*caccb7e3SMatthew G Knepley 912*caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 913*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 914*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 915*caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N); 916*caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 917*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 918*caccb7e3SMatthew G Knepley exit(-4000); 919*caccb7e3SMatthew G Knepley } else { 920*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr); 921*caccb7e3SMatthew G Knepley } 922*caccb7e3SMatthew G Knepley 923*caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 924*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 925*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 926*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 927*caccb7e3SMatthew G Knepley 928*caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 929*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 930*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 931*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 932*caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N); 933*caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 934*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 935*caccb7e3SMatthew G Knepley exit(-5000); 936*caccb7e3SMatthew G Knepley } else { 937*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr); 938*caccb7e3SMatthew G Knepley } 939*caccb7e3SMatthew G Knepley 940*caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 941*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 942*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 943*caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 944*caccb7e3SMatthew G Knepley 945*caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 946*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 947*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 948*caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 949*caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N); 950*caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 951*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 952*caccb7e3SMatthew G Knepley exit(-6000); 953*caccb7e3SMatthew G Knepley } else { 954*caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr); 955*caccb7e3SMatthew G Knepley } 956*caccb7e3SMatthew G Knepley 957*caccb7e3SMatthew G Knepley /* continue from here */ 958*caccb7e3SMatthew G Knepley printResultsReadable(times); 959*caccb7e3SMatthew G Knepley 960*caccb7e3SMatthew G Knepley //clean up timers 961*caccb7e3SMatthew G Knepley ierr = cudaEventDestroy( stop );CHKERRQ(ierr); 962*caccb7e3SMatthew G Knepley ierr = cudaEventDestroy( start );CHKERRQ(ierr); 963*caccb7e3SMatthew G Knepley 964*caccb7e3SMatthew G Knepley /* Free memory on device */ 965*caccb7e3SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 966*caccb7e3SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 967*caccb7e3SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 968*caccb7e3SMatthew G Knepley PetscFunctionReturn(0); 969*caccb7e3SMatthew G Knepley } 970*caccb7e3SMatthew G Knepley 971403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 972403adfb6SMatthew G Knepley //Print Results to Screen and File 973403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 974403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]) { 975403adfb6SMatthew G Knepley PetscErrorCode ierr; 976403adfb6SMatthew G Knepley PetscInt j, k; 977*caccb7e3SMatthew G Knepley float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 978*caccb7e3SMatthew G Knepley float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 979*caccb7e3SMatthew G Knepley float mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30}; 980*caccb7e3SMatthew G Knepley char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "}; 981*caccb7e3SMatthew G Knepley float bytes_per_kernel[8] = { 982403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 983403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 984403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 985*caccb7e3SMatthew G Knepley 2. * sizeof(float) * N, 986*caccb7e3SMatthew G Knepley 3. * sizeof(float) * N, 987*caccb7e3SMatthew G Knepley 3. * sizeof(float) * N, 988403adfb6SMatthew G Knepley 3. * sizeof(float) * N, 989403adfb6SMatthew G Knepley 3. * sizeof(float) * N 990403adfb6SMatthew G Knepley }; 991403adfb6SMatthew G Knepley 992403adfb6SMatthew G Knepley PetscFunctionBegin; 993403adfb6SMatthew G Knepley /* --- SUMMARY --- */ 994403adfb6SMatthew G Knepley for(k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */ 995*caccb7e3SMatthew G Knepley for(j = 0; j < 8; ++j) { 996403adfb6SMatthew G Knepley avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); 997403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k])); 998403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k])); 999403adfb6SMatthew G Knepley } 1000403adfb6SMatthew G Knepley } 1001403adfb6SMatthew G Knepley 1002403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "Function Rate (MB/s) Avg time Min time Max time\n");CHKERRQ(ierr); 1003403adfb6SMatthew G Knepley 1004*caccb7e3SMatthew G Knepley for(j = 0; j < 8; ++j) { 1005403adfb6SMatthew G Knepley avgtime[j] = avgtime[j]/(float)(NTIMES-1); 1006403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f %11.6f %12.6f %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr); 1007403adfb6SMatthew G Knepley } 1008403adfb6SMatthew G Knepley PetscFunctionReturn(0); 1009403adfb6SMatthew G Knepley } 1010