1403adfb6SMatthew G Knepley /* 2403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA. 3403adfb6SMatthew G Knepley 4403adfb6SMatthew G Knepley COPY: a(i) = b(i) 5403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i) 6403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i) 7403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i) 8403adfb6SMatthew G Knepley 9403adfb6SMatthew G Knepley It measures the memory system on the device. 10403adfb6SMatthew G Knepley The implementation is in single precision. 11403adfb6SMatthew G Knepley 12403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin 13403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c 14403adfb6SMatthew G Knepley 15403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation 16403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010 17403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010 18403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011 19403adfb6SMatthew G Knepley 20403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example. 21403adfb6SMatthew G Knepley */ 22403adfb6SMatthew G Knepley static char *help = "Single-Precision STREAM Benchmark implementation in CUDA\n" 23403adfb6SMatthew G Knepley "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n"; 24403adfb6SMatthew G Knepley 25403adfb6SMatthew G Knepley #include <petscconf.h> 26403adfb6SMatthew G Knepley #include <petscsys.h> 27403adfb6SMatthew G Knepley #include <petsctime.h> 28403adfb6SMatthew G Knepley 29403adfb6SMatthew G Knepley #define N 2000000 30caccb7e3SMatthew G Knepley #define N_DOUBLE 8000000 31403adfb6SMatthew G Knepley #define NTIMES 10 32403adfb6SMatthew G Knepley 33403adfb6SMatthew G Knepley # ifndef MIN 34403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y)?(x):(y)) 35403adfb6SMatthew G Knepley # endif 36403adfb6SMatthew G Knepley # ifndef MAX 37403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y)?(x):(y)) 38403adfb6SMatthew G Knepley # endif 39403adfb6SMatthew G Knepley 40403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f; 41caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16; 42403adfb6SMatthew G Knepley 43403adfb6SMatthew G Knepley __global__ void set_array(float *a, float value, size_t len) 44403adfb6SMatthew G Knepley { 45403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 46403adfb6SMatthew G Knepley while (idx < len) { 47403adfb6SMatthew G Knepley a[idx] = value; 48403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 49403adfb6SMatthew G Knepley } 50403adfb6SMatthew G Knepley } 51403adfb6SMatthew G Knepley 52caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a, double value, size_t len) 53caccb7e3SMatthew G Knepley { 54caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 55caccb7e3SMatthew G Knepley while (idx < len) { 56caccb7e3SMatthew G Knepley a[idx] = value; 57caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 58caccb7e3SMatthew G Knepley } 59caccb7e3SMatthew G Knepley } 60caccb7e3SMatthew G Knepley 61403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len) 62403adfb6SMatthew G Knepley { 63403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 64403adfb6SMatthew G Knepley while (idx < len) { 65403adfb6SMatthew G Knepley b[idx] = a[idx]; 66403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 67403adfb6SMatthew G Knepley } 68403adfb6SMatthew G Knepley } 69403adfb6SMatthew G Knepley 70caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len) 71caccb7e3SMatthew G Knepley { 72caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 73caccb7e3SMatthew G Knepley while (idx < len) { 74caccb7e3SMatthew G Knepley b[idx] = a[idx]; 75caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 76caccb7e3SMatthew G Knepley } 77caccb7e3SMatthew G Knepley } 78caccb7e3SMatthew G Knepley 79403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) 80403adfb6SMatthew G Knepley { 81403adfb6SMatthew G Knepley /* 82403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 83403adfb6SMatthew G Knepley * vector index space else return. 84403adfb6SMatthew G Knepley */ 85403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 86403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 87403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 88403adfb6SMatthew G Knepley } 89403adfb6SMatthew G Knepley 90caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) 91caccb7e3SMatthew G Knepley { 92caccb7e3SMatthew G Knepley /* 93caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 94caccb7e3SMatthew G Knepley * vector index space else return. 95caccb7e3SMatthew G Knepley */ 96caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 97caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 98caccb7e3SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 99caccb7e3SMatthew G Knepley } 100caccb7e3SMatthew G Knepley 101403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) 102403adfb6SMatthew G Knepley { 103403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 104403adfb6SMatthew G Knepley while (idx < len) { 105403adfb6SMatthew G Knepley b[idx] = scale* a[idx]; 106403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 107403adfb6SMatthew G Knepley } 108403adfb6SMatthew G Knepley } 109403adfb6SMatthew G Knepley 110caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) 111caccb7e3SMatthew G Knepley { 112caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 113caccb7e3SMatthew G Knepley while (idx < len) { 114caccb7e3SMatthew G Knepley b[idx] = scale* a[idx]; 115caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 116caccb7e3SMatthew G Knepley } 117caccb7e3SMatthew G Knepley } 118caccb7e3SMatthew G Knepley 119caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) 120caccb7e3SMatthew G Knepley { 121caccb7e3SMatthew G Knepley /* 122caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 123caccb7e3SMatthew G Knepley * vector index space else return. 124caccb7e3SMatthew G Knepley */ 125caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 126caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 127caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 128caccb7e3SMatthew G Knepley } 129caccb7e3SMatthew G Knepley 130caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) 131caccb7e3SMatthew G Knepley { 132caccb7e3SMatthew G Knepley /* 133caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 134caccb7e3SMatthew G Knepley * vector index space else return. 135caccb7e3SMatthew G Knepley */ 136caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 137caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 138caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 139caccb7e3SMatthew G Knepley } 140caccb7e3SMatthew G Knepley 141403adfb6SMatthew G Knepley __global__ void STREAM_Add( float *a, float *b, float *c, size_t len) 142403adfb6SMatthew G Knepley { 143403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 144403adfb6SMatthew G Knepley while (idx < len) { 145403adfb6SMatthew G Knepley c[idx] = a[idx]+b[idx]; 146403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 147403adfb6SMatthew G Knepley } 148403adfb6SMatthew G Knepley } 149403adfb6SMatthew G Knepley 150caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double( double *a, double *b, double *c, size_t len) 151caccb7e3SMatthew G Knepley { 152caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 153caccb7e3SMatthew G Knepley while (idx < len) { 154caccb7e3SMatthew G Knepley c[idx] = a[idx]+b[idx]; 155caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 156caccb7e3SMatthew G Knepley } 157caccb7e3SMatthew G Knepley } 158caccb7e3SMatthew G Knepley 159caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized( float *a, float *b, float *c, size_t len) 160caccb7e3SMatthew G Knepley { 161caccb7e3SMatthew G Knepley /* 162caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 163caccb7e3SMatthew G Knepley * vector index space else return. 164caccb7e3SMatthew G Knepley */ 165caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 166caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 167caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 168caccb7e3SMatthew G Knepley } 169caccb7e3SMatthew G Knepley 170caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double( double *a, double *b, double *c, size_t len) 171caccb7e3SMatthew G Knepley { 172caccb7e3SMatthew G Knepley /* 173caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 174caccb7e3SMatthew G Knepley * vector index space else return. 175caccb7e3SMatthew G Knepley */ 176caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 177caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 178caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 179caccb7e3SMatthew G Knepley } 180caccb7e3SMatthew G Knepley 181403adfb6SMatthew G Knepley __global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len) 182403adfb6SMatthew G Knepley { 183403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 184403adfb6SMatthew G Knepley while (idx < len) { 185403adfb6SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 186403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 187403adfb6SMatthew G Knepley } 188403adfb6SMatthew G Knepley } 189403adfb6SMatthew G Knepley 190caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double( double *a, double *b, double *c, double scalar, size_t len) 191caccb7e3SMatthew G Knepley { 192caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 193caccb7e3SMatthew G Knepley while (idx < len) { 194caccb7e3SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 195caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 196caccb7e3SMatthew G Knepley } 197caccb7e3SMatthew G Knepley } 198caccb7e3SMatthew G Knepley 199caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized( float *a, float *b, float *c, float scalar, size_t len) 200caccb7e3SMatthew G Knepley { 201caccb7e3SMatthew G Knepley /* 202caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 203caccb7e3SMatthew G Knepley * vector index space else return. 204caccb7e3SMatthew G Knepley */ 205caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 206caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 207caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 208caccb7e3SMatthew G Knepley } 209caccb7e3SMatthew G Knepley 210caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double( double *a, double *b, double *c, double scalar, size_t len) 211caccb7e3SMatthew G Knepley { 212caccb7e3SMatthew G Knepley /* 213caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 214caccb7e3SMatthew G Knepley * vector index space else return. 215caccb7e3SMatthew G Knepley */ 216caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 217caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 218caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 219caccb7e3SMatthew G Knepley } 220caccb7e3SMatthew G Knepley 221403adfb6SMatthew G Knepley /* Host side verification routines */ 222*a6dfd86eSKarl Rupp bool STREAM_Copy_verify(float *a, float *b, size_t len) 223*a6dfd86eSKarl Rupp { 224403adfb6SMatthew G Knepley size_t idx; 225403adfb6SMatthew G Knepley bool bDifferent = false; 226403adfb6SMatthew G Knepley 227403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 228403adfb6SMatthew G Knepley float expectedResult = a[idx]; 229403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 230403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 231403adfb6SMatthew G Knepley /* element-wise relative error determination */ 232403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 233403adfb6SMatthew G Knepley } 234403adfb6SMatthew G Knepley 235403adfb6SMatthew G Knepley return bDifferent; 236403adfb6SMatthew G Knepley } 237403adfb6SMatthew G Knepley 238*a6dfd86eSKarl Rupp bool STREAM_Copy_verify_double(double *a, double *b, size_t len) 239*a6dfd86eSKarl Rupp { 240caccb7e3SMatthew G Knepley size_t idx; 241caccb7e3SMatthew G Knepley bool bDifferent = false; 242caccb7e3SMatthew G Knepley 243caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 244caccb7e3SMatthew G Knepley double expectedResult = a[idx]; 245caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 246caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 247caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 248caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 249caccb7e3SMatthew G Knepley } 250caccb7e3SMatthew G Knepley 251caccb7e3SMatthew G Knepley return bDifferent; 252caccb7e3SMatthew G Knepley } 253caccb7e3SMatthew G Knepley 254*a6dfd86eSKarl Rupp bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) 255*a6dfd86eSKarl Rupp { 256403adfb6SMatthew G Knepley size_t idx; 257403adfb6SMatthew G Knepley bool bDifferent = false; 258403adfb6SMatthew G Knepley 259403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 260403adfb6SMatthew G Knepley float expectedResult = scale*a[idx]; 261403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 262403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 263403adfb6SMatthew G Knepley /* element-wise relative error determination */ 264403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 265403adfb6SMatthew G Knepley } 266403adfb6SMatthew G Knepley 267403adfb6SMatthew G Knepley return bDifferent; 268403adfb6SMatthew G Knepley } 269403adfb6SMatthew G Knepley 270*a6dfd86eSKarl Rupp bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) 271*a6dfd86eSKarl Rupp { 272caccb7e3SMatthew G Knepley size_t idx; 273caccb7e3SMatthew G Knepley bool bDifferent = false; 274caccb7e3SMatthew G Knepley 275caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 276caccb7e3SMatthew G Knepley double expectedResult = scale*a[idx]; 277caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 278caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 279caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 280caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 281caccb7e3SMatthew G Knepley } 282caccb7e3SMatthew G Knepley 283caccb7e3SMatthew G Knepley return bDifferent; 284caccb7e3SMatthew G Knepley } 285caccb7e3SMatthew G Knepley 286*a6dfd86eSKarl Rupp bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) 287*a6dfd86eSKarl Rupp { 288403adfb6SMatthew G Knepley size_t idx; 289403adfb6SMatthew G Knepley bool bDifferent = false; 290403adfb6SMatthew G Knepley 291403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 292403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx]; 293403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 294403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 295403adfb6SMatthew G Knepley /* element-wise relative error determination */ 296403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 297403adfb6SMatthew G Knepley } 298403adfb6SMatthew G Knepley 299403adfb6SMatthew G Knepley return bDifferent; 300403adfb6SMatthew G Knepley } 301403adfb6SMatthew G Knepley 302*a6dfd86eSKarl Rupp bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) 303*a6dfd86eSKarl Rupp { 304caccb7e3SMatthew G Knepley size_t idx; 305caccb7e3SMatthew G Knepley bool bDifferent = false; 306caccb7e3SMatthew G Knepley 307caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 308caccb7e3SMatthew G Knepley double expectedResult = a[idx] + b[idx]; 309caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 310caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 311caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 312caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 313caccb7e3SMatthew G Knepley } 314caccb7e3SMatthew G Knepley 315caccb7e3SMatthew G Knepley return bDifferent; 316caccb7e3SMatthew G Knepley } 317caccb7e3SMatthew G Knepley 318*a6dfd86eSKarl Rupp bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) 319*a6dfd86eSKarl Rupp { 320403adfb6SMatthew G Knepley size_t idx; 321403adfb6SMatthew G Knepley bool bDifferent = false; 322403adfb6SMatthew G Knepley 323403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 324403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar*b[idx]; 325403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 326403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 327403adfb6SMatthew G Knepley /* element-wise relative error determination */ 328403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f); 329403adfb6SMatthew G Knepley } 330403adfb6SMatthew G Knepley 331403adfb6SMatthew G Knepley return bDifferent; 332403adfb6SMatthew G Knepley } 333403adfb6SMatthew G Knepley 334*a6dfd86eSKarl Rupp bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) 335*a6dfd86eSKarl Rupp { 336caccb7e3SMatthew G Knepley size_t idx; 337caccb7e3SMatthew G Knepley bool bDifferent = false; 338caccb7e3SMatthew G Knepley 339caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 340caccb7e3SMatthew G Knepley double expectedResult = a[idx] + scalar*b[idx]; 341caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 342caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 343caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 344caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 3.); 345caccb7e3SMatthew G Knepley } 346caccb7e3SMatthew G Knepley 347caccb7e3SMatthew G Knepley return bDifferent; 348caccb7e3SMatthew G Knepley } 349caccb7e3SMatthew G Knepley 350403adfb6SMatthew G Knepley /* forward declarations */ 351caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming); 352403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 353caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 354403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]); 355403adfb6SMatthew G Knepley 356403adfb6SMatthew G Knepley int main(int argc, char *argv[]) 357403adfb6SMatthew G Knepley { 358403adfb6SMatthew G Knepley PetscInt device = 0; 359caccb7e3SMatthew G Knepley PetscBool runDouble = PETSC_FALSE; 360403adfb6SMatthew G Knepley PetscBool cpuTiming = PETSC_FALSE; 361403adfb6SMatthew G Knepley PetscErrorCode ierr; 362403adfb6SMatthew G Knepley 363403adfb6SMatthew G Knepley ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr); 364caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr); 365403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr); 366403adfb6SMatthew G Knepley 367403adfb6SMatthew G Knepley ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr); 368403adfb6SMatthew G Knepley ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr); 369caccb7e3SMatthew G Knepley ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr); 370403adfb6SMatthew G Knepley ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr); 371403adfb6SMatthew G Knepley ierr = PetscOptionsEnd(); 372403adfb6SMatthew G Knepley 373caccb7e3SMatthew G Knepley ierr = setupStream(device, runDouble, cpuTiming); 374403adfb6SMatthew G Knepley if (ierr >= 0) { 375403adfb6SMatthew G Knepley PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2); 376403adfb6SMatthew G Knepley } 377403adfb6SMatthew G Knepley PetscFinalize(); 378403adfb6SMatthew G Knepley return 0; 379403adfb6SMatthew G Knepley } 380403adfb6SMatthew G Knepley 381403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 382403adfb6SMatthew G Knepley //Run the appropriate tests 383403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 384caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) 385403adfb6SMatthew G Knepley { 386403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128; 387403adfb6SMatthew G Knepley PetscErrorCode ierr; 388403adfb6SMatthew G Knepley 389403adfb6SMatthew G Knepley PetscFunctionBegin; 390403adfb6SMatthew G Knepley // Check device 391403adfb6SMatthew G Knepley { 392403adfb6SMatthew G Knepley int deviceCount; 393403adfb6SMatthew G Knepley 394403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount); 395403adfb6SMatthew G Knepley if (deviceCount == 0) { 396403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr); 397403adfb6SMatthew G Knepley return -1000; 398403adfb6SMatthew G Knepley } 399403adfb6SMatthew G Knepley 400403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) { 401403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr); 402403adfb6SMatthew G Knepley deviceNum = 0; 403403adfb6SMatthew G Knepley } 404403adfb6SMatthew G Knepley } 405403adfb6SMatthew G Knepley 406403adfb6SMatthew G Knepley cudaSetDevice(deviceNum); 407403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr); 408403adfb6SMatthew G Knepley cudaDeviceProp deviceProp; 409403adfb6SMatthew G Knepley if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) { 410403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr); 411403adfb6SMatthew G Knepley } else { 412403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr); 413403adfb6SMatthew G Knepley return -1; 414403adfb6SMatthew G Knepley } 415403adfb6SMatthew G Knepley 416caccb7e3SMatthew G Knepley if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) { 417caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr); 418caccb7e3SMatthew G Knepley return -1; 419caccb7e3SMatthew G Knepley } 420403adfb6SMatthew G Knepley if (deviceProp.major == 2 && deviceProp.minor == 1) { 421403adfb6SMatthew G Knepley iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */ 422403adfb6SMatthew G Knepley } else { 423403adfb6SMatthew G Knepley iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */ 424403adfb6SMatthew G Knepley } 425403adfb6SMatthew G Knepley 426403adfb6SMatthew G Knepley if (cpuTiming) { 427403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr); 428403adfb6SMatthew G Knepley } 429403adfb6SMatthew G Knepley 430403adfb6SMatthew G Knepley ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 431caccb7e3SMatthew G Knepley if (runDouble) { 432caccb7e3SMatthew G Knepley ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr); 433caccb7e3SMatthew G Knepley ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 434caccb7e3SMatthew G Knepley } 435403adfb6SMatthew G Knepley PetscFunctionReturn(0); 436403adfb6SMatthew G Knepley } 437403adfb6SMatthew G Knepley 438403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 439403adfb6SMatthew G Knepley // runStream 440403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 441403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 442403adfb6SMatthew G Knepley { 443403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c; 444403adfb6SMatthew G Knepley int k; 445caccb7e3SMatthew G Knepley float times[8][NTIMES]; 446403adfb6SMatthew G Knepley float scalar; 447403adfb6SMatthew G Knepley PetscErrorCode ierr; 448403adfb6SMatthew G Knepley 449403adfb6SMatthew G Knepley PetscFunctionBegin; 450403adfb6SMatthew G Knepley /* Allocate memory on device */ 451403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr); 452403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr); 453403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr); 454403adfb6SMatthew G Knepley 455403adfb6SMatthew G Knepley /* Compute execution configuration */ 456403adfb6SMatthew G Knepley 457403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 458403adfb6SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 459403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 460403adfb6SMatthew G Knepley 461403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr); 462403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr); 463403adfb6SMatthew G Knepley 464403adfb6SMatthew G Knepley /* Initialize memory on the device */ 465403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 466403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 467403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 468403adfb6SMatthew G Knepley 469403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 470403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 471403adfb6SMatthew G Knepley cudaEvent_t start, stop; 472403adfb6SMatthew G Knepley 473403adfb6SMatthew G Knepley /* both timers report msec */ 474403adfb6SMatthew G Knepley ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */ 475403adfb6SMatthew G Knepley ierr = cudaEventCreate( &stop );CHKERRQ(ierr); /* gpu timer facility */ 476403adfb6SMatthew G Knepley 477403adfb6SMatthew G Knepley scalar=3.0f; 478403adfb6SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 479403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 480403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 481403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 482403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 483403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 484403adfb6SMatthew G Knepley //get the the total elapsed time in ms 485403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 486caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 487403adfb6SMatthew G Knepley times[0][k] = cpuTimer; 488403adfb6SMatthew G Knepley } else { 489403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[0][k], start, stop );CHKERRQ(ierr); 490403adfb6SMatthew G Knepley } 491403adfb6SMatthew G Knepley 492403adfb6SMatthew G Knepley cpuTimer = 0.0; 493403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 494403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 495403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 496403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 497403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 498403adfb6SMatthew G Knepley //get the the total elapsed time in ms 499403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 500caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 501403adfb6SMatthew G Knepley times[1][k] = cpuTimer; 502403adfb6SMatthew G Knepley } else { 503403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[1][k], start, stop );CHKERRQ(ierr); 504403adfb6SMatthew G Knepley } 505403adfb6SMatthew G Knepley 506403adfb6SMatthew G Knepley cpuTimer = 0.0; 507403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 508403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 509403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 510403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 511403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 512403adfb6SMatthew G Knepley //get the the total elapsed time in ms 513403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 514403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 515403adfb6SMatthew G Knepley times[2][k] = cpuTimer; 516403adfb6SMatthew G Knepley } else { 517403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 518403adfb6SMatthew G Knepley } 519403adfb6SMatthew G Knepley 520403adfb6SMatthew G Knepley cpuTimer = 0.0; 521403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 522403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 523caccb7e3SMatthew G Knepley STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 524403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 525403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 526403adfb6SMatthew G Knepley //get the the total elapsed time in ms 527403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 528403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 529403adfb6SMatthew G Knepley times[3][k] = cpuTimer; 530403adfb6SMatthew G Knepley } else { 531403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 532403adfb6SMatthew G Knepley } 533403adfb6SMatthew G Knepley 534403adfb6SMatthew G Knepley cpuTimer = 0.0; 535403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 536403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 537caccb7e3SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 538403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 539403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 540403adfb6SMatthew G Knepley //get the the total elapsed time in ms 541403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 542403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 543403adfb6SMatthew G Knepley times[4][k] = cpuTimer; 544403adfb6SMatthew G Knepley } else { 545403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 546403adfb6SMatthew G Knepley } 547403adfb6SMatthew G Knepley 548caccb7e3SMatthew G Knepley cpuTimer = 0.0; 549caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 550caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 551caccb7e3SMatthew G Knepley STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 552caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 553caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 554caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 555caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 556caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 557caccb7e3SMatthew G Knepley times[5][k] = cpuTimer; 558caccb7e3SMatthew G Knepley } else { 559caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[5][k], start, stop );CHKERRQ(ierr); 560caccb7e3SMatthew G Knepley } 561caccb7e3SMatthew G Knepley 562caccb7e3SMatthew G Knepley cpuTimer = 0.0; 563caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 564caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 565caccb7e3SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 566caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 567caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 568caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 569caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 570caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 571caccb7e3SMatthew G Knepley times[6][k] = cpuTimer; 572caccb7e3SMatthew G Knepley } else { 573caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[6][k], start, stop );CHKERRQ(ierr); 574caccb7e3SMatthew G Knepley } 575caccb7e3SMatthew G Knepley 576caccb7e3SMatthew G Knepley cpuTimer = 0.0; 577caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 578caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 579caccb7e3SMatthew G Knepley STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 580caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 581caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 582caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 583caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 584caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 585caccb7e3SMatthew G Knepley times[7][k] = cpuTimer; 586caccb7e3SMatthew G Knepley } else { 587caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[7][k], start, stop );CHKERRQ(ierr); 588caccb7e3SMatthew G Knepley } 589caccb7e3SMatthew G Knepley 590403adfb6SMatthew G Knepley } 591403adfb6SMatthew G Knepley 592403adfb6SMatthew G Knepley /* verify kernels */ 593403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c; 594403adfb6SMatthew G Knepley bool errorSTREAMkernel = true; 595403adfb6SMatthew G Knepley 596403adfb6SMatthew G Knepley if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 597403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 598403adfb6SMatthew G Knepley exit(1); 599403adfb6SMatthew G Knepley } 600403adfb6SMatthew G Knepley if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 601403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 602403adfb6SMatthew G Knepley exit(1); 603403adfb6SMatthew G Knepley } 604403adfb6SMatthew G Knepley 605403adfb6SMatthew G Knepley if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 606403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 607403adfb6SMatthew G Knepley exit(1); 608403adfb6SMatthew G Knepley } 609403adfb6SMatthew G Knepley 610403adfb6SMatthew G Knepley /* 611403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 612403adfb6SMatthew G Knepley * device kernel output 613403adfb6SMatthew G Knepley */ 614403adfb6SMatthew G Knepley 615403adfb6SMatthew G Knepley /* Initialize memory on the device */ 616403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 617403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 618403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 619403adfb6SMatthew G Knepley 620403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 621403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 622403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 623403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 624403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 625403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 626403adfb6SMatthew G Knepley exit(-2000); 627403adfb6SMatthew G Knepley } else { 628403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr); 629403adfb6SMatthew G Knepley } 630403adfb6SMatthew G Knepley 631403adfb6SMatthew G Knepley /* Initialize memory on the device */ 632403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 633403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 634403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 635403adfb6SMatthew G Knepley 636403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 637403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 638403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 639403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 640403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 641403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 642403adfb6SMatthew G Knepley exit(-3000); 643403adfb6SMatthew G Knepley } else { 644403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr); 645403adfb6SMatthew G Knepley } 646403adfb6SMatthew G Knepley 647403adfb6SMatthew G Knepley /* Initialize memory on the device */ 648403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 649403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 650403adfb6SMatthew G Knepley 651403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 652403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 653403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 654403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N); 655403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 656403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 657403adfb6SMatthew G Knepley exit(-4000); 658403adfb6SMatthew G Knepley } else { 659403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr); 660403adfb6SMatthew G Knepley } 661403adfb6SMatthew G Knepley 662403adfb6SMatthew G Knepley /* Initialize memory on the device */ 663403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 664403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 665403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 666403adfb6SMatthew G Knepley 667403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 668403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 669403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 670403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 671403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N); 672403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 673403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 674403adfb6SMatthew G Knepley exit(-5000); 675403adfb6SMatthew G Knepley } else { 676403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr); 677403adfb6SMatthew G Knepley } 678403adfb6SMatthew G Knepley 679403adfb6SMatthew G Knepley /* Initialize memory on the device */ 680403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 681403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 682403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 683403adfb6SMatthew G Knepley 684403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 685403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 686403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 687403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 688403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N); 689403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 690403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 691403adfb6SMatthew G Knepley exit(-6000); 692403adfb6SMatthew G Knepley } else { 693403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr); 694403adfb6SMatthew G Knepley } 695403adfb6SMatthew G Knepley 696403adfb6SMatthew G Knepley /* continue from here */ 697403adfb6SMatthew G Knepley printResultsReadable(times); 698403adfb6SMatthew G Knepley 699403adfb6SMatthew G Knepley //clean up timers 700403adfb6SMatthew G Knepley ierr = cudaEventDestroy( stop );CHKERRQ(ierr); 701403adfb6SMatthew G Knepley ierr = cudaEventDestroy( start );CHKERRQ(ierr); 702403adfb6SMatthew G Knepley 703403adfb6SMatthew G Knepley /* Free memory on device */ 704403adfb6SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 705403adfb6SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 706403adfb6SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 707403adfb6SMatthew G Knepley PetscFunctionReturn(0); 708403adfb6SMatthew G Knepley } 709403adfb6SMatthew G Knepley 710caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 711caccb7e3SMatthew G Knepley { 712caccb7e3SMatthew G Knepley double *d_a, *d_b, *d_c; 713caccb7e3SMatthew G Knepley int k; 714caccb7e3SMatthew G Knepley float times[8][NTIMES]; 715caccb7e3SMatthew G Knepley double scalar; 716caccb7e3SMatthew G Knepley PetscErrorCode ierr; 717caccb7e3SMatthew G Knepley 718caccb7e3SMatthew G Knepley PetscFunctionBegin; 719caccb7e3SMatthew G Knepley /* Allocate memory on device */ 720caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr); 721caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr); 722caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr); 723caccb7e3SMatthew G Knepley 724caccb7e3SMatthew G Knepley /* Compute execution configuration */ 725caccb7e3SMatthew G Knepley 726caccb7e3SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 727caccb7e3SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 728caccb7e3SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 729caccb7e3SMatthew G Knepley 730caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr); 731caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr); 732caccb7e3SMatthew G Knepley 733caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 734caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 735caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 736caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 737caccb7e3SMatthew G Knepley 738caccb7e3SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 739caccb7e3SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 740caccb7e3SMatthew G Knepley cudaEvent_t start, stop; 741caccb7e3SMatthew G Knepley 742caccb7e3SMatthew G Knepley /* both timers report msec */ 743caccb7e3SMatthew G Knepley ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */ 744caccb7e3SMatthew G Knepley ierr = cudaEventCreate( &stop );CHKERRQ(ierr); /* gpu timer facility */ 745caccb7e3SMatthew G Knepley 746caccb7e3SMatthew G Knepley scalar=3.0; 747caccb7e3SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 748caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 749caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 750caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 751caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 752caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 753caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 754caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 755caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 756caccb7e3SMatthew G Knepley times[0][k] = cpuTimer; 757caccb7e3SMatthew G Knepley } else { 758caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[0][k], start, stop );CHKERRQ(ierr); 759caccb7e3SMatthew G Knepley } 760caccb7e3SMatthew G Knepley 761caccb7e3SMatthew G Knepley cpuTimer = 0.0; 762caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 763caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 764caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 765caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 766caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 767caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 768caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 769caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 770caccb7e3SMatthew G Knepley times[1][k] = cpuTimer; 771caccb7e3SMatthew G Knepley } else { 772caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[1][k], start, stop );CHKERRQ(ierr); 773caccb7e3SMatthew G Knepley } 774caccb7e3SMatthew G Knepley 775caccb7e3SMatthew G Knepley cpuTimer = 0.0; 776caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 777caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 778caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 779caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 780caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 781caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 782caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 783caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 784caccb7e3SMatthew G Knepley times[2][k] = cpuTimer; 785caccb7e3SMatthew G Knepley } else { 786caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 787caccb7e3SMatthew G Knepley } 788caccb7e3SMatthew G Knepley 789caccb7e3SMatthew G Knepley cpuTimer = 0.0; 790caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 791caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 792caccb7e3SMatthew G Knepley STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 793caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 794caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 795caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 796caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 797caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 798caccb7e3SMatthew G Knepley times[3][k] = cpuTimer; 799caccb7e3SMatthew G Knepley } else { 800caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 801caccb7e3SMatthew G Knepley } 802caccb7e3SMatthew G Knepley 803caccb7e3SMatthew G Knepley cpuTimer = 0.0; 804caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 805caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 806caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 807caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 808caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 809caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 810caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 811caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 812caccb7e3SMatthew G Knepley times[4][k] = cpuTimer; 813caccb7e3SMatthew G Knepley } else { 814caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 815caccb7e3SMatthew G Knepley } 816caccb7e3SMatthew G Knepley 817caccb7e3SMatthew G Knepley cpuTimer = 0.0; 818caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 819caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 820caccb7e3SMatthew G Knepley STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 821caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 822caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 823caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 824caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 825caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 826caccb7e3SMatthew G Knepley times[5][k] = cpuTimer; 827caccb7e3SMatthew G Knepley } else { 828caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 829caccb7e3SMatthew G Knepley } 830caccb7e3SMatthew G Knepley 831caccb7e3SMatthew G Knepley cpuTimer = 0.0; 832caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 833caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 834caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 835caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 836caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 837caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 838caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 839caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 840caccb7e3SMatthew G Knepley times[6][k] = cpuTimer; 841caccb7e3SMatthew G Knepley } else { 842caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 843caccb7e3SMatthew G Knepley } 844caccb7e3SMatthew G Knepley 845caccb7e3SMatthew G Knepley cpuTimer = 0.0; 846caccb7e3SMatthew G Knepley PetscTimeSubtract(cpuTimer); 847caccb7e3SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 848caccb7e3SMatthew G Knepley STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 849caccb7e3SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 850caccb7e3SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 851caccb7e3SMatthew G Knepley //get the the total elapsed time in ms 852caccb7e3SMatthew G Knepley PetscTimeAdd(cpuTimer); 853caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 854caccb7e3SMatthew G Knepley times[7][k] = cpuTimer; 855caccb7e3SMatthew G Knepley } else { 856caccb7e3SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 857caccb7e3SMatthew G Knepley } 858caccb7e3SMatthew G Knepley 859caccb7e3SMatthew G Knepley } 860caccb7e3SMatthew G Knepley 861caccb7e3SMatthew G Knepley /* verify kernels */ 862caccb7e3SMatthew G Knepley double *h_a, *h_b, *h_c; 863caccb7e3SMatthew G Knepley bool errorSTREAMkernel = true; 864caccb7e3SMatthew G Knepley 865caccb7e3SMatthew G Knepley if ( (h_a = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) { 866caccb7e3SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 867caccb7e3SMatthew G Knepley exit(1); 868caccb7e3SMatthew G Knepley } 869caccb7e3SMatthew G Knepley if ( (h_b = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) { 870caccb7e3SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 871caccb7e3SMatthew G Knepley exit(1); 872caccb7e3SMatthew G Knepley } 873caccb7e3SMatthew G Knepley 874caccb7e3SMatthew G Knepley if ( (h_c = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) { 875caccb7e3SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 876caccb7e3SMatthew G Knepley exit(1); 877caccb7e3SMatthew G Knepley } 878caccb7e3SMatthew G Knepley 879caccb7e3SMatthew G Knepley /* 880caccb7e3SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 881caccb7e3SMatthew G Knepley * device kernel output 882caccb7e3SMatthew G Knepley */ 883caccb7e3SMatthew G Knepley 884caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 885caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 886caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 887caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 888caccb7e3SMatthew G Knepley 889caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 890caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 891caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 892caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 893caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 894caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 895caccb7e3SMatthew G Knepley exit(-2000); 896caccb7e3SMatthew G Knepley } else { 897caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr); 898caccb7e3SMatthew G Knepley } 899caccb7e3SMatthew G Knepley 900caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 901caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 902caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 903caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 904caccb7e3SMatthew G Knepley 905caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 906caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 907caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 908caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 909caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 910caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 911caccb7e3SMatthew G Knepley exit(-3000); 912caccb7e3SMatthew G Knepley } else { 913caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr); 914caccb7e3SMatthew G Knepley } 915caccb7e3SMatthew G Knepley 916caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 917caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 918caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 919caccb7e3SMatthew G Knepley 920caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 921caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 922caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 923caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N); 924caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 925caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 926caccb7e3SMatthew G Knepley exit(-4000); 927caccb7e3SMatthew G Knepley } else { 928caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr); 929caccb7e3SMatthew G Knepley } 930caccb7e3SMatthew G Knepley 931caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 932caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 933caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 934caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 935caccb7e3SMatthew G Knepley 936caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 937caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 938caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 939caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 940caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N); 941caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 942caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 943caccb7e3SMatthew G Knepley exit(-5000); 944caccb7e3SMatthew G Knepley } else { 945caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr); 946caccb7e3SMatthew G Knepley } 947caccb7e3SMatthew G Knepley 948caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 949caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 950caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 951caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 952caccb7e3SMatthew G Knepley 953caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 954caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 955caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 956caccb7e3SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 957caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N); 958caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 959caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 960caccb7e3SMatthew G Knepley exit(-6000); 961caccb7e3SMatthew G Knepley } else { 962caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr); 963caccb7e3SMatthew G Knepley } 964caccb7e3SMatthew G Knepley 965caccb7e3SMatthew G Knepley /* continue from here */ 966caccb7e3SMatthew G Knepley printResultsReadable(times); 967caccb7e3SMatthew G Knepley 968caccb7e3SMatthew G Knepley //clean up timers 969caccb7e3SMatthew G Knepley ierr = cudaEventDestroy( stop );CHKERRQ(ierr); 970caccb7e3SMatthew G Knepley ierr = cudaEventDestroy( start );CHKERRQ(ierr); 971caccb7e3SMatthew G Knepley 972caccb7e3SMatthew G Knepley /* Free memory on device */ 973caccb7e3SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 974caccb7e3SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 975caccb7e3SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 976caccb7e3SMatthew G Knepley PetscFunctionReturn(0); 977caccb7e3SMatthew G Knepley } 978caccb7e3SMatthew G Knepley 979403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 980403adfb6SMatthew G Knepley //Print Results to Screen and File 981403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 982*a6dfd86eSKarl Rupp PetscErrorCode printResultsReadable(float times[][NTIMES]) 983*a6dfd86eSKarl Rupp { 984403adfb6SMatthew G Knepley PetscErrorCode ierr; 985403adfb6SMatthew G Knepley PetscInt j, k; 986caccb7e3SMatthew G Knepley float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 987caccb7e3SMatthew G Knepley float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 988caccb7e3SMatthew G Knepley float mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30}; 989caccb7e3SMatthew G Knepley char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "}; 990caccb7e3SMatthew G Knepley float bytes_per_kernel[8] = { 991403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 992403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 993403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 994caccb7e3SMatthew G Knepley 2. * sizeof(float) * N, 995caccb7e3SMatthew G Knepley 3. * sizeof(float) * N, 996caccb7e3SMatthew G Knepley 3. * sizeof(float) * N, 997403adfb6SMatthew G Knepley 3. * sizeof(float) * N, 998403adfb6SMatthew G Knepley 3. * sizeof(float) * N 999403adfb6SMatthew G Knepley }; 1000403adfb6SMatthew G Knepley 1001403adfb6SMatthew G Knepley PetscFunctionBegin; 1002403adfb6SMatthew G Knepley /* --- SUMMARY --- */ 1003403adfb6SMatthew G Knepley for (k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */ 1004caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 1005403adfb6SMatthew G Knepley avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); 1006403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k])); 1007403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k])); 1008403adfb6SMatthew G Knepley } 1009403adfb6SMatthew G Knepley } 1010403adfb6SMatthew G Knepley 1011403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "Function Rate (MB/s) Avg time Min time Max time\n");CHKERRQ(ierr); 1012403adfb6SMatthew G Knepley 1013caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 1014403adfb6SMatthew G Knepley avgtime[j] = avgtime[j]/(float)(NTIMES-1); 1015403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f %11.6f %12.6f %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr); 1016403adfb6SMatthew G Knepley } 1017403adfb6SMatthew G Knepley PetscFunctionReturn(0); 1018403adfb6SMatthew G Knepley } 1019