1403adfb6SMatthew G Knepley /* 2403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA. 3403adfb6SMatthew G Knepley 4403adfb6SMatthew G Knepley COPY: a(i) = b(i) 5403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i) 6403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i) 7403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i) 8403adfb6SMatthew G Knepley 9403adfb6SMatthew G Knepley It measures the memory system on the device. 1019816777SMark The implementation is in double precision with a single option. 11403adfb6SMatthew G Knepley 12403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin 13403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c 14403adfb6SMatthew G Knepley 15403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation 16403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010 17403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010 18403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011 19403adfb6SMatthew G Knepley 20403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example. 21403adfb6SMatthew G Knepley */ 2219816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n"; 23403adfb6SMatthew G Knepley 24403adfb6SMatthew G Knepley #include <petscconf.h> 25403adfb6SMatthew G Knepley #include <petscsys.h> 26403adfb6SMatthew G Knepley #include <petsctime.h> 275f80ce2aSJacob Faibussowitsch #include <petscdevice.h> 28403adfb6SMatthew G Knepley 2919816777SMark #define N 10000000 30403adfb6SMatthew G Knepley #define NTIMES 10 31403adfb6SMatthew G Knepley 32403adfb6SMatthew G Knepley # ifndef MIN 33403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y) ? (x) : (y)) 34403adfb6SMatthew G Knepley # endif 35403adfb6SMatthew G Knepley # ifndef MAX 36403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y) ? (x) : (y)) 37403adfb6SMatthew G Knepley # endif 38403adfb6SMatthew G Knepley 39403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f; 40caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16; 41403adfb6SMatthew G Knepley 42403adfb6SMatthew G Knepley __global__ void set_array(float *a, float value, size_t len) 43403adfb6SMatthew G Knepley { 44403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 45403adfb6SMatthew G Knepley while (idx < len) { 46403adfb6SMatthew G Knepley a[idx] = value; 47403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 48403adfb6SMatthew G Knepley } 49403adfb6SMatthew G Knepley } 50403adfb6SMatthew G Knepley 51caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a, double value, size_t len) 52caccb7e3SMatthew G Knepley { 53caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 54caccb7e3SMatthew G Knepley while (idx < len) { 55caccb7e3SMatthew G Knepley a[idx] = value; 56caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 57caccb7e3SMatthew G Knepley } 58caccb7e3SMatthew G Knepley } 59caccb7e3SMatthew G Knepley 60403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len) 61403adfb6SMatthew G Knepley { 62403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 63403adfb6SMatthew G Knepley while (idx < len) { 64403adfb6SMatthew G Knepley b[idx] = a[idx]; 65403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 66403adfb6SMatthew G Knepley } 67403adfb6SMatthew G Knepley } 68403adfb6SMatthew G Knepley 69caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len) 70caccb7e3SMatthew G Knepley { 71caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 72caccb7e3SMatthew G Knepley while (idx < len) { 73caccb7e3SMatthew G Knepley b[idx] = a[idx]; 74caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 75caccb7e3SMatthew G Knepley } 76caccb7e3SMatthew G Knepley } 77caccb7e3SMatthew G Knepley 78403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) 79403adfb6SMatthew G Knepley { 80403adfb6SMatthew G Knepley /* 81403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 82403adfb6SMatthew G Knepley * vector index space else return. 83403adfb6SMatthew G Knepley */ 84403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 85403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 86403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 87403adfb6SMatthew G Knepley } 88403adfb6SMatthew G Knepley 89caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) 90caccb7e3SMatthew G Knepley { 91caccb7e3SMatthew G Knepley /* 92caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 93caccb7e3SMatthew G Knepley * vector index space else return. 94caccb7e3SMatthew G Knepley */ 95caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 96caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 97caccb7e3SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 98caccb7e3SMatthew G Knepley } 99caccb7e3SMatthew G Knepley 100403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) 101403adfb6SMatthew G Knepley { 102403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 103403adfb6SMatthew G Knepley while (idx < len) { 104403adfb6SMatthew G Knepley b[idx] = scale* a[idx]; 105403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 106403adfb6SMatthew G Knepley } 107403adfb6SMatthew G Knepley } 108403adfb6SMatthew G Knepley 109caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) 110caccb7e3SMatthew G Knepley { 111caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 112caccb7e3SMatthew G Knepley while (idx < len) { 113caccb7e3SMatthew G Knepley b[idx] = scale* a[idx]; 114caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 115caccb7e3SMatthew G Knepley } 116caccb7e3SMatthew G Knepley } 117caccb7e3SMatthew G Knepley 118caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) 119caccb7e3SMatthew G Knepley { 120caccb7e3SMatthew G Knepley /* 121caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 122caccb7e3SMatthew G Knepley * vector index space else return. 123caccb7e3SMatthew G Knepley */ 124caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 125caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 126caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 127caccb7e3SMatthew G Knepley } 128caccb7e3SMatthew G Knepley 129caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) 130caccb7e3SMatthew G Knepley { 131caccb7e3SMatthew G Knepley /* 132caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 133caccb7e3SMatthew G Knepley * vector index space else return. 134caccb7e3SMatthew G Knepley */ 135caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 136caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 137caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 138caccb7e3SMatthew G Knepley } 139caccb7e3SMatthew G Knepley 140403adfb6SMatthew G Knepley __global__ void STREAM_Add(float *a, float *b, float *c, size_t len) 141403adfb6SMatthew G Knepley { 142403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 143403adfb6SMatthew G Knepley while (idx < len) { 144403adfb6SMatthew G Knepley c[idx] = a[idx]+b[idx]; 145403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 146403adfb6SMatthew G Knepley } 147403adfb6SMatthew G Knepley } 148403adfb6SMatthew G Knepley 149caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len) 150caccb7e3SMatthew G Knepley { 151caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 152caccb7e3SMatthew G Knepley while (idx < len) { 153caccb7e3SMatthew G Knepley c[idx] = a[idx]+b[idx]; 154caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 155caccb7e3SMatthew G Knepley } 156caccb7e3SMatthew G Knepley } 157caccb7e3SMatthew G Knepley 158caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len) 159caccb7e3SMatthew G Knepley { 160caccb7e3SMatthew G Knepley /* 161caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 162caccb7e3SMatthew G Knepley * vector index space else return. 163caccb7e3SMatthew G Knepley */ 164caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 165caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 166caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 167caccb7e3SMatthew G Knepley } 168caccb7e3SMatthew G Knepley 169caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len) 170caccb7e3SMatthew G Knepley { 171caccb7e3SMatthew G Knepley /* 172caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 173caccb7e3SMatthew G Knepley * vector index space else return. 174caccb7e3SMatthew G Knepley */ 175caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 176caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 177caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 178caccb7e3SMatthew G Knepley } 179caccb7e3SMatthew G Knepley 180403adfb6SMatthew G Knepley __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len) 181403adfb6SMatthew G Knepley { 182403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 183403adfb6SMatthew G Knepley while (idx < len) { 184403adfb6SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 185403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 186403adfb6SMatthew G Knepley } 187403adfb6SMatthew G Knepley } 188403adfb6SMatthew G Knepley 189caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len) 190caccb7e3SMatthew G Knepley { 191caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 192caccb7e3SMatthew G Knepley while (idx < len) { 193caccb7e3SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 194caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 195caccb7e3SMatthew G Knepley } 196caccb7e3SMatthew G Knepley } 197caccb7e3SMatthew G Knepley 198caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len) 199caccb7e3SMatthew G Knepley { 200caccb7e3SMatthew G Knepley /* 201caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 202caccb7e3SMatthew G Knepley * vector index space else return. 203caccb7e3SMatthew G Knepley */ 204caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 205caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 206caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 207caccb7e3SMatthew G Knepley } 208caccb7e3SMatthew G Knepley 209caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len) 210caccb7e3SMatthew G Knepley { 211caccb7e3SMatthew G Knepley /* 212caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 213caccb7e3SMatthew G Knepley * vector index space else return. 214caccb7e3SMatthew G Knepley */ 215caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 216caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 217caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 218caccb7e3SMatthew G Knepley } 219caccb7e3SMatthew G Knepley 220403adfb6SMatthew G Knepley /* Host side verification routines */ 221a6dfd86eSKarl Rupp bool STREAM_Copy_verify(float *a, float *b, size_t len) 222a6dfd86eSKarl Rupp { 223403adfb6SMatthew G Knepley size_t idx; 224403adfb6SMatthew G Knepley bool bDifferent = false; 225403adfb6SMatthew G Knepley 226403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 227403adfb6SMatthew G Knepley float expectedResult = a[idx]; 228403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 229403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 230403adfb6SMatthew G Knepley /* element-wise relative error determination */ 231403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 232403adfb6SMatthew G Knepley } 233403adfb6SMatthew G Knepley 234403adfb6SMatthew G Knepley return bDifferent; 235403adfb6SMatthew G Knepley } 236403adfb6SMatthew G Knepley 237a6dfd86eSKarl Rupp bool STREAM_Copy_verify_double(double *a, double *b, size_t len) 238a6dfd86eSKarl Rupp { 239caccb7e3SMatthew G Knepley size_t idx; 240caccb7e3SMatthew G Knepley bool bDifferent = false; 241caccb7e3SMatthew G Knepley 242caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 243caccb7e3SMatthew G Knepley double expectedResult = a[idx]; 244caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 24519816777SMark double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/dbl_eps; 246caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 247caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 248caccb7e3SMatthew G Knepley } 249caccb7e3SMatthew G Knepley 250caccb7e3SMatthew G Knepley return bDifferent; 251caccb7e3SMatthew G Knepley } 252caccb7e3SMatthew G Knepley 253a6dfd86eSKarl Rupp bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) 254a6dfd86eSKarl Rupp { 255403adfb6SMatthew G Knepley size_t idx; 256403adfb6SMatthew G Knepley bool bDifferent = false; 257403adfb6SMatthew G Knepley 258403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 259403adfb6SMatthew G Knepley float expectedResult = scale*a[idx]; 260403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 261403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 262403adfb6SMatthew G Knepley /* element-wise relative error determination */ 263403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 264403adfb6SMatthew G Knepley } 265403adfb6SMatthew G Knepley 266403adfb6SMatthew G Knepley return bDifferent; 267403adfb6SMatthew G Knepley } 268403adfb6SMatthew G Knepley 269a6dfd86eSKarl Rupp bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) 270a6dfd86eSKarl Rupp { 271caccb7e3SMatthew G Knepley size_t idx; 272caccb7e3SMatthew G Knepley bool bDifferent = false; 273caccb7e3SMatthew G Knepley 274caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 275caccb7e3SMatthew G Knepley double expectedResult = scale*a[idx]; 276caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 277caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 278caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 279caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 280caccb7e3SMatthew G Knepley } 281caccb7e3SMatthew G Knepley 282caccb7e3SMatthew G Knepley return bDifferent; 283caccb7e3SMatthew G Knepley } 284caccb7e3SMatthew G Knepley 285a6dfd86eSKarl Rupp bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) 286a6dfd86eSKarl Rupp { 287403adfb6SMatthew G Knepley size_t idx; 288403adfb6SMatthew G Knepley bool bDifferent = false; 289403adfb6SMatthew G Knepley 290403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 291403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx]; 292403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 293403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 294403adfb6SMatthew G Knepley /* element-wise relative error determination */ 295403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 296403adfb6SMatthew G Knepley } 297403adfb6SMatthew G Knepley 298403adfb6SMatthew G Knepley return bDifferent; 299403adfb6SMatthew G Knepley } 300403adfb6SMatthew G Knepley 301a6dfd86eSKarl Rupp bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) 302a6dfd86eSKarl Rupp { 303caccb7e3SMatthew G Knepley size_t idx; 304caccb7e3SMatthew G Knepley bool bDifferent = false; 305caccb7e3SMatthew G Knepley 306caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 307caccb7e3SMatthew G Knepley double expectedResult = a[idx] + b[idx]; 308caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 309caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 310caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 311caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 312caccb7e3SMatthew G Knepley } 313caccb7e3SMatthew G Knepley 314caccb7e3SMatthew G Knepley return bDifferent; 315caccb7e3SMatthew G Knepley } 316caccb7e3SMatthew G Knepley 317a6dfd86eSKarl Rupp bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) 318a6dfd86eSKarl Rupp { 319403adfb6SMatthew G Knepley size_t idx; 320403adfb6SMatthew G Knepley bool bDifferent = false; 321403adfb6SMatthew G Knepley 322403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 323403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar*b[idx]; 324403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 325403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 326403adfb6SMatthew G Knepley /* element-wise relative error determination */ 327403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f); 328403adfb6SMatthew G Knepley } 329403adfb6SMatthew G Knepley 330403adfb6SMatthew G Knepley return bDifferent; 331403adfb6SMatthew G Knepley } 332403adfb6SMatthew G Knepley 333a6dfd86eSKarl Rupp bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) 334a6dfd86eSKarl Rupp { 335caccb7e3SMatthew G Knepley size_t idx; 336caccb7e3SMatthew G Knepley bool bDifferent = false; 337caccb7e3SMatthew G Knepley 338caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 339caccb7e3SMatthew G Knepley double expectedResult = a[idx] + scalar*b[idx]; 340caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 341caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 342caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 343caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 3.); 344caccb7e3SMatthew G Knepley } 345caccb7e3SMatthew G Knepley 346caccb7e3SMatthew G Knepley return bDifferent; 347caccb7e3SMatthew G Knepley } 348caccb7e3SMatthew G Knepley 349403adfb6SMatthew G Knepley /* forward declarations */ 350caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming); 351403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 352caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 35319816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t); 354403adfb6SMatthew G Knepley 355403adfb6SMatthew G Knepley int main(int argc, char *argv[]) 356403adfb6SMatthew G Knepley { 357403adfb6SMatthew G Knepley PetscInt device = 0; 35819816777SMark PetscBool runDouble = PETSC_TRUE; 35919816777SMark const PetscBool cpuTiming = PETSC_TRUE; // must be true 360403adfb6SMatthew G Knepley PetscErrorCode ierr; 361403adfb6SMatthew G Knepley 3625f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync)); 36319816777SMark 364*b122ec5aSJacob Faibussowitsch CHKERRQ(PetscInitialize(&argc, &argv, 0, help)); 365403adfb6SMatthew G Knepley 366403adfb6SMatthew G Knepley ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr); 3675f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0)); 3685f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL)); 369403adfb6SMatthew G Knepley ierr = PetscOptionsEnd(); 370403adfb6SMatthew G Knepley 371caccb7e3SMatthew G Knepley ierr = setupStream(device, runDouble, cpuTiming); 37219816777SMark if (ierr) { 3735f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED")); 374403adfb6SMatthew G Knepley } 375*b122ec5aSJacob Faibussowitsch CHKERRQ(PetscFinalize()); 376*b122ec5aSJacob Faibussowitsch return 0; 377403adfb6SMatthew G Knepley } 378403adfb6SMatthew G Knepley 379403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 380403adfb6SMatthew G Knepley //Run the appropriate tests 381403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 382caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) 383403adfb6SMatthew G Knepley { 384403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128; 385403adfb6SMatthew G Knepley PetscErrorCode ierr; 386403adfb6SMatthew G Knepley 387403adfb6SMatthew G Knepley PetscFunctionBegin; 388403adfb6SMatthew G Knepley // Check device 389403adfb6SMatthew G Knepley { 390403adfb6SMatthew G Knepley int deviceCount; 391403adfb6SMatthew G Knepley 392403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount); 393403adfb6SMatthew G Knepley if (deviceCount == 0) { 3945f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n")); 395403adfb6SMatthew G Knepley return -1000; 396403adfb6SMatthew G Knepley } 397403adfb6SMatthew G Knepley 398403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) { 3995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0)); 400403adfb6SMatthew G Knepley deviceNum = 0; 401403adfb6SMatthew G Knepley } 402403adfb6SMatthew G Knepley } 403403adfb6SMatthew G Knepley 404403adfb6SMatthew G Knepley cudaSetDevice(deviceNum); 4055f80ce2aSJacob Faibussowitsch // CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n")); 406403adfb6SMatthew G Knepley cudaDeviceProp deviceProp; 40719816777SMark if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) { 4085f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n")); 409403adfb6SMatthew G Knepley return -1; 410403adfb6SMatthew G Knepley } 411403adfb6SMatthew G Knepley 412caccb7e3SMatthew G Knepley if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) { 4135f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n")); 414caccb7e3SMatthew G Knepley return -1; 415caccb7e3SMatthew G Knepley } 4166f2b61bcSKarl Rupp if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */ 4176f2b61bcSKarl Rupp else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */ 418403adfb6SMatthew G Knepley 419caccb7e3SMatthew G Knepley if (runDouble) { 4205f80ce2aSJacob Faibussowitsch CHKERRQ(runStreamDouble(iNumThreadsPerBlock, cpuTiming)); 42119816777SMark } else { 4225f80ce2aSJacob Faibussowitsch CHKERRQ(runStream(iNumThreadsPerBlock, cpuTiming)); 423caccb7e3SMatthew G Knepley } 424403adfb6SMatthew G Knepley PetscFunctionReturn(0); 425403adfb6SMatthew G Knepley } 426403adfb6SMatthew G Knepley 427403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 428403adfb6SMatthew G Knepley // runStream 429403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 430403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 431403adfb6SMatthew G Knepley { 432403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c; 433403adfb6SMatthew G Knepley int k; 434caccb7e3SMatthew G Knepley float times[8][NTIMES]; 435403adfb6SMatthew G Knepley float scalar; 436403adfb6SMatthew G Knepley PetscErrorCode ierr; 437403adfb6SMatthew G Knepley 438403adfb6SMatthew G Knepley PetscFunctionBegin; 439403adfb6SMatthew G Knepley /* Allocate memory on device */ 4405f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N)); 4415f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N)); 4425f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N)); 443403adfb6SMatthew G Knepley 444403adfb6SMatthew G Knepley /* Compute execution configuration */ 445403adfb6SMatthew G Knepley 446403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 447403adfb6SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 448403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 449403adfb6SMatthew G Knepley 450403adfb6SMatthew G Knepley /* Initialize memory on the device */ 451403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 452403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 453403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 454403adfb6SMatthew G Knepley 455403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 456403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 457403adfb6SMatthew G Knepley 458403adfb6SMatthew G Knepley scalar=3.0f; 459403adfb6SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 4608563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 461403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 46219816777SMark cudaStreamSynchronize(NULL); 4635f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 4648563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 46519816777SMark if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec 466403adfb6SMatthew G Knepley 467403adfb6SMatthew G Knepley cpuTimer = 0.0; 4688563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 469403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 47019816777SMark cudaStreamSynchronize(NULL); 4715f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 472df3898eeSBarry Smith //get the total elapsed time in ms 4738563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 47419816777SMark if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3; 475403adfb6SMatthew G Knepley 476403adfb6SMatthew G Knepley cpuTimer = 0.0; 4778563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 478403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 47919816777SMark cudaStreamSynchronize(NULL); 4805f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 481df3898eeSBarry Smith //get the total elapsed time in ms 4828563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 48319816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3; 484403adfb6SMatthew G Knepley 485403adfb6SMatthew G Knepley cpuTimer = 0.0; 4868563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 487caccb7e3SMatthew G Knepley STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 48819816777SMark cudaStreamSynchronize(NULL); 4895f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 490df3898eeSBarry Smith //get the total elapsed time in ms 4918563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 49219816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3; 493403adfb6SMatthew G Knepley 494403adfb6SMatthew G Knepley cpuTimer = 0.0; 4958563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 4965f80ce2aSJacob Faibussowitsch // CHKERRCUDA(cudaEventRecord(start, 0)); 497caccb7e3SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 49819816777SMark cudaStreamSynchronize(NULL); 4995f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 5005f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaEventRecord(stop, 0)); 5015f80ce2aSJacob Faibussowitsch // CHKERRCUDA(cudaEventSynchronize(stop)); 502df3898eeSBarry Smith //get the total elapsed time in ms 5038563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 50419816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3; 5056f2b61bcSKarl Rupp else { 5065f80ce2aSJacob Faibussowitsch // CHKERRCUDA(cudaEventElapsedTime(×[4][k], start, stop)); 507403adfb6SMatthew G Knepley } 508403adfb6SMatthew G Knepley 509caccb7e3SMatthew G Knepley cpuTimer = 0.0; 5108563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 511caccb7e3SMatthew G Knepley STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 51219816777SMark cudaStreamSynchronize(NULL); 5135f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 514df3898eeSBarry Smith //get the total elapsed time in ms 5158563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 51619816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3; 517caccb7e3SMatthew G Knepley 518caccb7e3SMatthew G Knepley cpuTimer = 0.0; 5198563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 520caccb7e3SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 52119816777SMark cudaStreamSynchronize(NULL); 5225f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 523df3898eeSBarry Smith //get the total elapsed time in ms 5248563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 52519816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3; 526caccb7e3SMatthew G Knepley 527caccb7e3SMatthew G Knepley cpuTimer = 0.0; 5288563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 529caccb7e3SMatthew G Knepley STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 53019816777SMark cudaStreamSynchronize(NULL); 5315f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 532df3898eeSBarry Smith //get the total elapsed time in ms 5338563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 53419816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3; 535caccb7e3SMatthew G Knepley } 536caccb7e3SMatthew G Knepley 53719816777SMark if (1) { /* verify kernels */ 538403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c; 539403adfb6SMatthew G Knepley bool errorSTREAMkernel = true; 540403adfb6SMatthew G Knepley 541403adfb6SMatthew G Knepley if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) { 542403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 543403adfb6SMatthew G Knepley exit(1); 544403adfb6SMatthew G Knepley } 545403adfb6SMatthew G Knepley if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) { 546403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 547403adfb6SMatthew G Knepley exit(1); 548403adfb6SMatthew G Knepley } 549403adfb6SMatthew G Knepley 550403adfb6SMatthew G Knepley if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) { 551403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 552403adfb6SMatthew G Knepley exit(1); 553403adfb6SMatthew G Knepley } 554403adfb6SMatthew G Knepley 555403adfb6SMatthew G Knepley /* 556403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 557403adfb6SMatthew G Knepley * device kernel output 558403adfb6SMatthew G Knepley */ 559403adfb6SMatthew G Knepley 560403adfb6SMatthew G Knepley /* Initialize memory on the device */ 561403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 562403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 563403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 564403adfb6SMatthew G Knepley 565403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 5665f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5675f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 568403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 569403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5705f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n")); 571403adfb6SMatthew G Knepley exit(-2000); 572403adfb6SMatthew G Knepley } 573403adfb6SMatthew G Knepley 574403adfb6SMatthew G Knepley /* Initialize memory on the device */ 575403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 576403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 577403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 578403adfb6SMatthew G Knepley 579403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 5805f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5815f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 582403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 583403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5845f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n")); 585403adfb6SMatthew G Knepley exit(-3000); 586403adfb6SMatthew G Knepley } 587403adfb6SMatthew G Knepley 588403adfb6SMatthew G Knepley /* Initialize memory on the device */ 58919816777SMark set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 590403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 591403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 592403adfb6SMatthew G Knepley 593403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 5945f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5955f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 596403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N); 597403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n")); 599403adfb6SMatthew G Knepley exit(-4000); 600403adfb6SMatthew G Knepley } 601403adfb6SMatthew G Knepley 602403adfb6SMatthew G Knepley /* Initialize memory on the device */ 603403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 604403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 605403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 606403adfb6SMatthew G Knepley 607403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 6085f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 6095f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost)); 6105f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 611403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N); 612403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 6135f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n")); 614403adfb6SMatthew G Knepley exit(-5000); 615403adfb6SMatthew G Knepley } 616403adfb6SMatthew G Knepley 617403adfb6SMatthew G Knepley /* Initialize memory on the device */ 618403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 619403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 620403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 621403adfb6SMatthew G Knepley 622403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 6235f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 6245f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost)); 6255f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 626403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N); 627403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 6285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n")); 629403adfb6SMatthew G Knepley exit(-6000); 630403adfb6SMatthew G Knepley } 631403adfb6SMatthew G Knepley 63219816777SMark free(h_a); 63319816777SMark free(h_b); 63419816777SMark free(h_c); 63519816777SMark } 636403adfb6SMatthew G Knepley /* continue from here */ 63719816777SMark printResultsReadable(times, sizeof(float)); 638403adfb6SMatthew G Knepley 639403adfb6SMatthew G Knepley /* Free memory on device */ 6405f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_a)); 6415f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_b)); 6425f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_c)); 64319816777SMark 644403adfb6SMatthew G Knepley PetscFunctionReturn(0); 645403adfb6SMatthew G Knepley } 646403adfb6SMatthew G Knepley 647caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 648caccb7e3SMatthew G Knepley { 649caccb7e3SMatthew G Knepley double *d_a, *d_b, *d_c; 650caccb7e3SMatthew G Knepley int k; 651caccb7e3SMatthew G Knepley float times[8][NTIMES]; 652caccb7e3SMatthew G Knepley double scalar; 653caccb7e3SMatthew G Knepley PetscErrorCode ierr; 654caccb7e3SMatthew G Knepley 655caccb7e3SMatthew G Knepley PetscFunctionBegin; 656caccb7e3SMatthew G Knepley /* Allocate memory on device */ 6575f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N)); 6585f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N)); 6595f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N)); 660caccb7e3SMatthew G Knepley 661caccb7e3SMatthew G Knepley /* Compute execution configuration */ 662caccb7e3SMatthew G Knepley 663caccb7e3SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 664caccb7e3SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 665caccb7e3SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 666caccb7e3SMatthew G Knepley 667caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 668caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 669caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 670caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 671caccb7e3SMatthew G Knepley 672caccb7e3SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 673caccb7e3SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 674caccb7e3SMatthew G Knepley 675caccb7e3SMatthew G Knepley scalar=3.0; 676caccb7e3SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 6778563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 678caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 67919816777SMark cudaStreamSynchronize(NULL); 6805f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 681df3898eeSBarry Smith //get the total elapsed time in ms 682caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 6838563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 68419816777SMark times[0][k] = cpuTimer*1.e3; 685caccb7e3SMatthew G Knepley } 686caccb7e3SMatthew G Knepley 687caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6888563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 689caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 69019816777SMark cudaStreamSynchronize(NULL); 6915f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 692df3898eeSBarry Smith //get the total elapsed time in ms 693caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 6948563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 69519816777SMark times[1][k] = cpuTimer*1.e3; 696caccb7e3SMatthew G Knepley } 697caccb7e3SMatthew G Knepley 698caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6998563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 700caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 70119816777SMark cudaStreamSynchronize(NULL); 7025f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 703df3898eeSBarry Smith //get the total elapsed time in ms 7048563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 70519816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3; 706caccb7e3SMatthew G Knepley 707caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7088563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 709caccb7e3SMatthew G Knepley STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 71019816777SMark cudaStreamSynchronize(NULL); 7115f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 712df3898eeSBarry Smith //get the total elapsed time in ms 7138563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 71419816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3; 715caccb7e3SMatthew G Knepley 716caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7178563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 718caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 71919816777SMark cudaStreamSynchronize(NULL); 7205f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 721df3898eeSBarry Smith //get the total elapsed time in ms 7228563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 72319816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3; 724caccb7e3SMatthew G Knepley 725caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7268563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 727caccb7e3SMatthew G Knepley STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 72819816777SMark cudaStreamSynchronize(NULL); 7295f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 730df3898eeSBarry Smith //get the total elapsed time in ms 7318563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 73219816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3; 733caccb7e3SMatthew G Knepley 734caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7358563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 736caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 73719816777SMark cudaStreamSynchronize(NULL); 7385f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 739df3898eeSBarry Smith //get the total elapsed time in ms 7408563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 74119816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3; 742caccb7e3SMatthew G Knepley 743caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7448563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 745caccb7e3SMatthew G Knepley STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 74619816777SMark cudaStreamSynchronize(NULL); 7475f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD)); 748df3898eeSBarry Smith //get the total elapsed time in ms 7498563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 75019816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3; 751caccb7e3SMatthew G Knepley } 752caccb7e3SMatthew G Knepley 75319816777SMark if (1) { /* verify kernels */ 754caccb7e3SMatthew G Knepley double *h_a, *h_b, *h_c; 755caccb7e3SMatthew G Knepley bool errorSTREAMkernel = true; 756caccb7e3SMatthew G Knepley 757caccb7e3SMatthew G Knepley if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) { 758caccb7e3SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 759caccb7e3SMatthew G Knepley exit(1); 760caccb7e3SMatthew G Knepley } 761caccb7e3SMatthew G Knepley if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) { 762caccb7e3SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 763caccb7e3SMatthew G Knepley exit(1); 764caccb7e3SMatthew G Knepley } 765caccb7e3SMatthew G Knepley 766caccb7e3SMatthew G Knepley if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) { 767caccb7e3SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 768caccb7e3SMatthew G Knepley exit(1); 769caccb7e3SMatthew G Knepley } 770caccb7e3SMatthew G Knepley 771caccb7e3SMatthew G Knepley /* 772caccb7e3SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 773caccb7e3SMatthew G Knepley * device kernel output 774caccb7e3SMatthew G Knepley */ 775caccb7e3SMatthew G Knepley 776caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 777caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 778caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 779caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 780caccb7e3SMatthew G Knepley 781caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 7825f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7835f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 784caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 785caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 7865f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n")); 787caccb7e3SMatthew G Knepley exit(-2000); 788caccb7e3SMatthew G Knepley } 789caccb7e3SMatthew G Knepley 790caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 791caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 792caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 793caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 794caccb7e3SMatthew G Knepley 795caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 7965f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7975f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 798caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 799caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 8005f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n")); 801caccb7e3SMatthew G Knepley exit(-3000); 802caccb7e3SMatthew G Knepley } 803caccb7e3SMatthew G Knepley 804caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 805caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 806caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 807caccb7e3SMatthew G Knepley 808caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 8095f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8105f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 811caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N); 812caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 8135f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n")); 814caccb7e3SMatthew G Knepley exit(-4000); 815caccb7e3SMatthew G Knepley } 816caccb7e3SMatthew G Knepley 817caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 818caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 819caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 820caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 821caccb7e3SMatthew G Knepley 822caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 8235f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8245f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8255f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 826caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N); 827caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 8285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n")); 829caccb7e3SMatthew G Knepley exit(-5000); 830caccb7e3SMatthew G Knepley } 831caccb7e3SMatthew G Knepley 832caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 833caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 834caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 835caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 836caccb7e3SMatthew G Knepley 837caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 8385f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8395f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8405f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 841caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N); 842caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 8435f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n")); 844caccb7e3SMatthew G Knepley exit(-6000); 845caccb7e3SMatthew G Knepley } 846caccb7e3SMatthew G Knepley 84719816777SMark free(h_a); 84819816777SMark free(h_b); 84919816777SMark free(h_c); 85019816777SMark } 851caccb7e3SMatthew G Knepley /* continue from here */ 85219816777SMark printResultsReadable(times,sizeof(double)); 853caccb7e3SMatthew G Knepley 854caccb7e3SMatthew G Knepley /* Free memory on device */ 8555f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_a)); 8565f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_b)); 8575f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_c)); 85819816777SMark 859caccb7e3SMatthew G Knepley PetscFunctionReturn(0); 860caccb7e3SMatthew G Knepley } 861caccb7e3SMatthew G Knepley 862403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 863403adfb6SMatthew G Knepley //Print Results to Screen and File 864403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 86519816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize) 866a6dfd86eSKarl Rupp { 867403adfb6SMatthew G Knepley PetscErrorCode ierr; 868403adfb6SMatthew G Knepley PetscInt j, k; 869caccb7e3SMatthew G Knepley float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 870caccb7e3SMatthew G Knepley float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 871caccb7e3SMatthew G Knepley float mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30}; 87219816777SMark // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "}; 87319816777SMark const float bytes_per_kernel[8] = { 87419816777SMark 2. * bsize * N, 87519816777SMark 2. * bsize * N, 87619816777SMark 2. * bsize * N, 87719816777SMark 2. * bsize * N, 87819816777SMark 3. * bsize * N, 87919816777SMark 3. * bsize * N, 88019816777SMark 3. * bsize * N, 88119816777SMark 3. * bsize * N 882403adfb6SMatthew G Knepley }; 88319816777SMark double rate,irate; 88419816777SMark int rank,size; 885403adfb6SMatthew G Knepley PetscFunctionBegin; 8865f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank)); 8875f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_size(MPI_COMM_WORLD,&size)); 888403adfb6SMatthew G Knepley /* --- SUMMARY --- */ 88919816777SMark for (k = 0; k < NTIMES; ++k) { 890caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 89119816777SMark avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec 892403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k])); 893403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k])); 894403adfb6SMatthew G Knepley } 89519816777SMark } 896caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 897403adfb6SMatthew G Knepley avgtime[j] = avgtime[j]/(float)(NTIMES-1); 898403adfb6SMatthew G Knepley } 89919816777SMark j = 7; 90019816777SMark irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j]; 90119816777SMark ierr = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 902dd400576SPatrick Sanan if (rank == 0) { 90319816777SMark FILE *fd; 90419816777SMark if (size == 1) { 90519816777SMark printf("%d %11.4f Rate (MB/s)\n",size, rate); 90619816777SMark fd = fopen("flops","w"); 90719816777SMark fprintf(fd,"%g\n",rate); 90819816777SMark fclose(fd); 90919816777SMark } else { 91019816777SMark double prate; 91119816777SMark fd = fopen("flops","r"); 91219816777SMark fscanf(fd,"%lg",&prate); 91319816777SMark fclose(fd); 91419816777SMark printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate); 91519816777SMark } 91619816777SMark } 91719816777SMark 918403adfb6SMatthew G Knepley PetscFunctionReturn(0); 919403adfb6SMatthew G Knepley } 920