1403adfb6SMatthew G Knepley /* 2403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA. 3403adfb6SMatthew G Knepley 4403adfb6SMatthew G Knepley COPY: a(i) = b(i) 5403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i) 6403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i) 7403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i) 8403adfb6SMatthew G Knepley 9403adfb6SMatthew G Knepley It measures the memory system on the device. 10*19816777SMark The implementation is in double precision with a single option. 11403adfb6SMatthew G Knepley 12403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin 13403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c 14403adfb6SMatthew G Knepley 15403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation 16403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010 17403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010 18403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011 19403adfb6SMatthew G Knepley 20403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example. 21403adfb6SMatthew G Knepley */ 22*19816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n"; 23403adfb6SMatthew G Knepley 24403adfb6SMatthew G Knepley #include <petscconf.h> 25403adfb6SMatthew G Knepley #include <petscsys.h> 26403adfb6SMatthew G Knepley #include <petsctime.h> 27403adfb6SMatthew G Knepley 28*19816777SMark #define N 10000000 29403adfb6SMatthew G Knepley #define NTIMES 10 30403adfb6SMatthew G Knepley 31403adfb6SMatthew G Knepley # ifndef MIN 32403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y) ? (x) : (y)) 33403adfb6SMatthew G Knepley # endif 34403adfb6SMatthew G Knepley # ifndef MAX 35403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y) ? (x) : (y)) 36403adfb6SMatthew G Knepley # endif 37403adfb6SMatthew G Knepley 38403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f; 39caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16; 40403adfb6SMatthew G Knepley 41403adfb6SMatthew G Knepley __global__ void set_array(float *a, float value, size_t len) 42403adfb6SMatthew G Knepley { 43403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 44403adfb6SMatthew G Knepley while (idx < len) { 45403adfb6SMatthew G Knepley a[idx] = value; 46403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 47403adfb6SMatthew G Knepley } 48403adfb6SMatthew G Knepley } 49403adfb6SMatthew G Knepley 50caccb7e3SMatthew G Knepley __global__ void set_array_double(double *a, double value, size_t len) 51caccb7e3SMatthew G Knepley { 52caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 53caccb7e3SMatthew G Knepley while (idx < len) { 54caccb7e3SMatthew G Knepley a[idx] = value; 55caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 56caccb7e3SMatthew G Knepley } 57caccb7e3SMatthew G Knepley } 58caccb7e3SMatthew G Knepley 59403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len) 60403adfb6SMatthew G Knepley { 61403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 62403adfb6SMatthew G Knepley while (idx < len) { 63403adfb6SMatthew G Knepley b[idx] = a[idx]; 64403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 65403adfb6SMatthew G Knepley } 66403adfb6SMatthew G Knepley } 67403adfb6SMatthew G Knepley 68caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_double(double *a, double *b, size_t len) 69caccb7e3SMatthew G Knepley { 70caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 71caccb7e3SMatthew G Knepley while (idx < len) { 72caccb7e3SMatthew G Knepley b[idx] = a[idx]; 73caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 74caccb7e3SMatthew G Knepley } 75caccb7e3SMatthew G Knepley } 76caccb7e3SMatthew G Knepley 77403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) 78403adfb6SMatthew G Knepley { 79403adfb6SMatthew G Knepley /* 80403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 81403adfb6SMatthew G Knepley * vector index space else return. 82403adfb6SMatthew G Knepley */ 83403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 84403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 85403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 86403adfb6SMatthew G Knepley } 87403adfb6SMatthew G Knepley 88caccb7e3SMatthew G Knepley __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) 89caccb7e3SMatthew G Knepley { 90caccb7e3SMatthew G Knepley /* 91caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 92caccb7e3SMatthew G Knepley * vector index space else return. 93caccb7e3SMatthew G Knepley */ 94caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 95caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 96caccb7e3SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 97caccb7e3SMatthew G Knepley } 98caccb7e3SMatthew G Knepley 99403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) 100403adfb6SMatthew G Knepley { 101403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 102403adfb6SMatthew G Knepley while (idx < len) { 103403adfb6SMatthew G Knepley b[idx] = scale* a[idx]; 104403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 105403adfb6SMatthew G Knepley } 106403adfb6SMatthew G Knepley } 107403adfb6SMatthew G Knepley 108caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) 109caccb7e3SMatthew G Knepley { 110caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 111caccb7e3SMatthew G Knepley while (idx < len) { 112caccb7e3SMatthew G Knepley b[idx] = scale* a[idx]; 113caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 114caccb7e3SMatthew G Knepley } 115caccb7e3SMatthew G Knepley } 116caccb7e3SMatthew G Knepley 117caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) 118caccb7e3SMatthew G Knepley { 119caccb7e3SMatthew G Knepley /* 120caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 121caccb7e3SMatthew G Knepley * vector index space else return. 122caccb7e3SMatthew G Knepley */ 123caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 124caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 125caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 126caccb7e3SMatthew G Knepley } 127caccb7e3SMatthew G Knepley 128caccb7e3SMatthew G Knepley __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) 129caccb7e3SMatthew G Knepley { 130caccb7e3SMatthew G Knepley /* 131caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 132caccb7e3SMatthew G Knepley * vector index space else return. 133caccb7e3SMatthew G Knepley */ 134caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 135caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 136caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale* a[idx]; 137caccb7e3SMatthew G Knepley } 138caccb7e3SMatthew G Knepley 139403adfb6SMatthew G Knepley __global__ void STREAM_Add(float *a, float *b, float *c, size_t len) 140403adfb6SMatthew G Knepley { 141403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 142403adfb6SMatthew G Knepley while (idx < len) { 143403adfb6SMatthew G Knepley c[idx] = a[idx]+b[idx]; 144403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 145403adfb6SMatthew G Knepley } 146403adfb6SMatthew G Knepley } 147403adfb6SMatthew G Knepley 148caccb7e3SMatthew G Knepley __global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len) 149caccb7e3SMatthew G Knepley { 150caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 151caccb7e3SMatthew G Knepley while (idx < len) { 152caccb7e3SMatthew G Knepley c[idx] = a[idx]+b[idx]; 153caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 154caccb7e3SMatthew G Knepley } 155caccb7e3SMatthew G Knepley } 156caccb7e3SMatthew G Knepley 157caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len) 158caccb7e3SMatthew G Knepley { 159caccb7e3SMatthew G Knepley /* 160caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 161caccb7e3SMatthew G Knepley * vector index space else return. 162caccb7e3SMatthew G Knepley */ 163caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 164caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 165caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 166caccb7e3SMatthew G Knepley } 167caccb7e3SMatthew G Knepley 168caccb7e3SMatthew G Knepley __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len) 169caccb7e3SMatthew G Knepley { 170caccb7e3SMatthew G Knepley /* 171caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 172caccb7e3SMatthew G Knepley * vector index space else return. 173caccb7e3SMatthew G Knepley */ 174caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 175caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 176caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+b[idx]; 177caccb7e3SMatthew G Knepley } 178caccb7e3SMatthew G Knepley 179403adfb6SMatthew G Knepley __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len) 180403adfb6SMatthew G Knepley { 181403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 182403adfb6SMatthew G Knepley while (idx < len) { 183403adfb6SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 184403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 185403adfb6SMatthew G Knepley } 186403adfb6SMatthew G Knepley } 187403adfb6SMatthew G Knepley 188caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len) 189caccb7e3SMatthew G Knepley { 190caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 191caccb7e3SMatthew G Knepley while (idx < len) { 192caccb7e3SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 193caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 194caccb7e3SMatthew G Knepley } 195caccb7e3SMatthew G Knepley } 196caccb7e3SMatthew G Knepley 197caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len) 198caccb7e3SMatthew G Knepley { 199caccb7e3SMatthew G Knepley /* 200caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 201caccb7e3SMatthew G Knepley * vector index space else return. 202caccb7e3SMatthew G Knepley */ 203caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 204caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 205caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 206caccb7e3SMatthew G Knepley } 207caccb7e3SMatthew G Knepley 208caccb7e3SMatthew G Knepley __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len) 209caccb7e3SMatthew G Knepley { 210caccb7e3SMatthew G Knepley /* 211caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 212caccb7e3SMatthew G Knepley * vector index space else return. 213caccb7e3SMatthew G Knepley */ 214caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 215caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 216caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 217caccb7e3SMatthew G Knepley } 218caccb7e3SMatthew G Knepley 219403adfb6SMatthew G Knepley /* Host side verification routines */ 220a6dfd86eSKarl Rupp bool STREAM_Copy_verify(float *a, float *b, size_t len) 221a6dfd86eSKarl Rupp { 222403adfb6SMatthew G Knepley size_t idx; 223403adfb6SMatthew G Knepley bool bDifferent = false; 224403adfb6SMatthew G Knepley 225403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 226403adfb6SMatthew G Knepley float expectedResult = a[idx]; 227403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 228403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 229403adfb6SMatthew G Knepley /* element-wise relative error determination */ 230403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 231403adfb6SMatthew G Knepley } 232403adfb6SMatthew G Knepley 233403adfb6SMatthew G Knepley return bDifferent; 234403adfb6SMatthew G Knepley } 235403adfb6SMatthew G Knepley 236a6dfd86eSKarl Rupp bool STREAM_Copy_verify_double(double *a, double *b, size_t len) 237a6dfd86eSKarl Rupp { 238caccb7e3SMatthew G Knepley size_t idx; 239caccb7e3SMatthew G Knepley bool bDifferent = false; 240caccb7e3SMatthew G Knepley 241caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 242caccb7e3SMatthew G Knepley double expectedResult = a[idx]; 243caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 244*19816777SMark double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/dbl_eps; 245caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 246caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 247caccb7e3SMatthew G Knepley } 248caccb7e3SMatthew G Knepley 249caccb7e3SMatthew G Knepley return bDifferent; 250caccb7e3SMatthew G Knepley } 251caccb7e3SMatthew G Knepley 252a6dfd86eSKarl Rupp bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) 253a6dfd86eSKarl Rupp { 254403adfb6SMatthew G Knepley size_t idx; 255403adfb6SMatthew G Knepley bool bDifferent = false; 256403adfb6SMatthew G Knepley 257403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 258403adfb6SMatthew G Knepley float expectedResult = scale*a[idx]; 259403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 260403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 261403adfb6SMatthew G Knepley /* element-wise relative error determination */ 262403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 263403adfb6SMatthew G Knepley } 264403adfb6SMatthew G Knepley 265403adfb6SMatthew G Knepley return bDifferent; 266403adfb6SMatthew G Knepley } 267403adfb6SMatthew G Knepley 268a6dfd86eSKarl Rupp bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) 269a6dfd86eSKarl Rupp { 270caccb7e3SMatthew G Knepley size_t idx; 271caccb7e3SMatthew G Knepley bool bDifferent = false; 272caccb7e3SMatthew G Knepley 273caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 274caccb7e3SMatthew G Knepley double expectedResult = scale*a[idx]; 275caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 276caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 277caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 278caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 279caccb7e3SMatthew G Knepley } 280caccb7e3SMatthew G Knepley 281caccb7e3SMatthew G Knepley return bDifferent; 282caccb7e3SMatthew G Knepley } 283caccb7e3SMatthew G Knepley 284a6dfd86eSKarl Rupp bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) 285a6dfd86eSKarl Rupp { 286403adfb6SMatthew G Knepley size_t idx; 287403adfb6SMatthew G Knepley bool bDifferent = false; 288403adfb6SMatthew G Knepley 289403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 290403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx]; 291403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 292403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 293403adfb6SMatthew G Knepley /* element-wise relative error determination */ 294403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 295403adfb6SMatthew G Knepley } 296403adfb6SMatthew G Knepley 297403adfb6SMatthew G Knepley return bDifferent; 298403adfb6SMatthew G Knepley } 299403adfb6SMatthew G Knepley 300a6dfd86eSKarl Rupp bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) 301a6dfd86eSKarl Rupp { 302caccb7e3SMatthew G Knepley size_t idx; 303caccb7e3SMatthew G Knepley bool bDifferent = false; 304caccb7e3SMatthew G Knepley 305caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 306caccb7e3SMatthew G Knepley double expectedResult = a[idx] + b[idx]; 307caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 308caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 309caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 310caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 311caccb7e3SMatthew G Knepley } 312caccb7e3SMatthew G Knepley 313caccb7e3SMatthew G Knepley return bDifferent; 314caccb7e3SMatthew G Knepley } 315caccb7e3SMatthew G Knepley 316a6dfd86eSKarl Rupp bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) 317a6dfd86eSKarl Rupp { 318403adfb6SMatthew G Knepley size_t idx; 319403adfb6SMatthew G Knepley bool bDifferent = false; 320403adfb6SMatthew G Knepley 321403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 322403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar*b[idx]; 323403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 324403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 325403adfb6SMatthew G Knepley /* element-wise relative error determination */ 326403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f); 327403adfb6SMatthew G Knepley } 328403adfb6SMatthew G Knepley 329403adfb6SMatthew G Knepley return bDifferent; 330403adfb6SMatthew G Knepley } 331403adfb6SMatthew G Knepley 332a6dfd86eSKarl Rupp bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) 333a6dfd86eSKarl Rupp { 334caccb7e3SMatthew G Knepley size_t idx; 335caccb7e3SMatthew G Knepley bool bDifferent = false; 336caccb7e3SMatthew G Knepley 337caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 338caccb7e3SMatthew G Knepley double expectedResult = a[idx] + scalar*b[idx]; 339caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 340caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 341caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 342caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 3.); 343caccb7e3SMatthew G Knepley } 344caccb7e3SMatthew G Knepley 345caccb7e3SMatthew G Knepley return bDifferent; 346caccb7e3SMatthew G Knepley } 347caccb7e3SMatthew G Knepley 348403adfb6SMatthew G Knepley /* forward declarations */ 349caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming); 350403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 351caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 352*19816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t); 353403adfb6SMatthew G Knepley 354403adfb6SMatthew G Knepley int main(int argc, char *argv[]) 355403adfb6SMatthew G Knepley { 356403adfb6SMatthew G Knepley PetscInt device = 0; 357*19816777SMark PetscBool runDouble = PETSC_TRUE; 358*19816777SMark const PetscBool cpuTiming = PETSC_TRUE; // must be true 359403adfb6SMatthew G Knepley PetscErrorCode ierr; 360403adfb6SMatthew G Knepley 361*19816777SMark ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr); 362*19816777SMark 363a438ae71SBarry Smith ierr = PetscInitialize(&argc, &argv, 0, help);if (ierr) return ierr; 364403adfb6SMatthew G Knepley 365403adfb6SMatthew G Knepley ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr); 3665a856986SBarry Smith ierr = PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0);CHKERRQ(ierr); 3670298fd71SBarry Smith ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr); 368403adfb6SMatthew G Knepley ierr = PetscOptionsEnd(); 369403adfb6SMatthew G Knepley 370caccb7e3SMatthew G Knepley ierr = setupStream(device, runDouble, cpuTiming); 371*19816777SMark if (ierr) { 37226f47effSBarry Smith ierr = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr); 373403adfb6SMatthew G Knepley } 37426f47effSBarry Smith ierr = PetscFinalize(); 37526f47effSBarry Smith return ierr; 376403adfb6SMatthew G Knepley } 377403adfb6SMatthew G Knepley 378403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 379403adfb6SMatthew G Knepley //Run the appropriate tests 380403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 381caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) 382403adfb6SMatthew G Knepley { 383403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128; 384403adfb6SMatthew G Knepley PetscErrorCode ierr; 385403adfb6SMatthew G Knepley 386403adfb6SMatthew G Knepley PetscFunctionBegin; 387403adfb6SMatthew G Knepley // Check device 388403adfb6SMatthew G Knepley { 389403adfb6SMatthew G Knepley int deviceCount; 390403adfb6SMatthew G Knepley 391403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount); 392403adfb6SMatthew G Knepley if (deviceCount == 0) { 393403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr); 394403adfb6SMatthew G Knepley return -1000; 395403adfb6SMatthew G Knepley } 396403adfb6SMatthew G Knepley 397403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) { 398403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr); 399403adfb6SMatthew G Knepley deviceNum = 0; 400403adfb6SMatthew G Knepley } 401403adfb6SMatthew G Knepley } 402403adfb6SMatthew G Knepley 403403adfb6SMatthew G Knepley cudaSetDevice(deviceNum); 404*19816777SMark // ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr); 405403adfb6SMatthew G Knepley cudaDeviceProp deviceProp; 406*19816777SMark if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) { 407403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr); 408403adfb6SMatthew G Knepley return -1; 409403adfb6SMatthew G Knepley } 410403adfb6SMatthew G Knepley 411caccb7e3SMatthew G Knepley if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) { 412caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr); 413caccb7e3SMatthew G Knepley return -1; 414caccb7e3SMatthew G Knepley } 4156f2b61bcSKarl Rupp if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */ 4166f2b61bcSKarl Rupp else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */ 417403adfb6SMatthew G Knepley 418caccb7e3SMatthew G Knepley if (runDouble) { 419caccb7e3SMatthew G Knepley ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 420*19816777SMark } else { 421*19816777SMark ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 422caccb7e3SMatthew G Knepley } 423403adfb6SMatthew G Knepley PetscFunctionReturn(0); 424403adfb6SMatthew G Knepley } 425403adfb6SMatthew G Knepley 426403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 427403adfb6SMatthew G Knepley // runStream 428403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 429403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 430403adfb6SMatthew G Knepley { 431403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c; 432403adfb6SMatthew G Knepley int k; 433caccb7e3SMatthew G Knepley float times[8][NTIMES]; 434403adfb6SMatthew G Knepley float scalar; 435403adfb6SMatthew G Knepley PetscErrorCode ierr; 436403adfb6SMatthew G Knepley 437403adfb6SMatthew G Knepley PetscFunctionBegin; 438403adfb6SMatthew G Knepley /* Allocate memory on device */ 439403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr); 440403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr); 441403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr); 442403adfb6SMatthew G Knepley 443403adfb6SMatthew G Knepley /* Compute execution configuration */ 444403adfb6SMatthew G Knepley 445403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 446403adfb6SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 447403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 448403adfb6SMatthew G Knepley 449403adfb6SMatthew G Knepley /* Initialize memory on the device */ 450403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 451403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 452403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 453403adfb6SMatthew G Knepley 454403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 455403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 456403adfb6SMatthew G Knepley 457403adfb6SMatthew G Knepley scalar=3.0f; 458403adfb6SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 4598563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 460403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 461*19816777SMark cudaStreamSynchronize(NULL); 462*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 4638563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 464*19816777SMark if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec 465403adfb6SMatthew G Knepley 466403adfb6SMatthew G Knepley cpuTimer = 0.0; 4678563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 468403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 469*19816777SMark cudaStreamSynchronize(NULL); 470*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 471df3898eeSBarry Smith //get the total elapsed time in ms 4728563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 473*19816777SMark if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3; 474403adfb6SMatthew G Knepley 475403adfb6SMatthew G Knepley cpuTimer = 0.0; 4768563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 477403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 478*19816777SMark cudaStreamSynchronize(NULL); 479*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 480df3898eeSBarry Smith //get the total elapsed time in ms 4818563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 482*19816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3; 483403adfb6SMatthew G Knepley 484403adfb6SMatthew G Knepley cpuTimer = 0.0; 4858563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 486caccb7e3SMatthew G Knepley STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 487*19816777SMark cudaStreamSynchronize(NULL); 488*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 489df3898eeSBarry Smith //get the total elapsed time in ms 4908563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 491*19816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3; 492403adfb6SMatthew G Knepley 493403adfb6SMatthew G Knepley cpuTimer = 0.0; 4948563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 495*19816777SMark // ierr = cudaEventRecord(start, 0);CHKERRQ(ierr); 496caccb7e3SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 497*19816777SMark cudaStreamSynchronize(NULL); 498*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); // ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr); 499*19816777SMark // ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 500df3898eeSBarry Smith //get the total elapsed time in ms 5018563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 502*19816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3; 5036f2b61bcSKarl Rupp else { 504*19816777SMark // ierr = cudaEventElapsedTime(×[4][k], start, stop);CHKERRQ(ierr); 505403adfb6SMatthew G Knepley } 506403adfb6SMatthew G Knepley 507caccb7e3SMatthew G Knepley cpuTimer = 0.0; 5088563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 509caccb7e3SMatthew G Knepley STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 510*19816777SMark cudaStreamSynchronize(NULL); 511*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 512df3898eeSBarry Smith //get the total elapsed time in ms 5138563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 514*19816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3; 515caccb7e3SMatthew G Knepley 516caccb7e3SMatthew G Knepley cpuTimer = 0.0; 5178563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 518caccb7e3SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 519*19816777SMark cudaStreamSynchronize(NULL); 520*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 521df3898eeSBarry Smith //get the total elapsed time in ms 5228563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 523*19816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3; 524caccb7e3SMatthew G Knepley 525caccb7e3SMatthew G Knepley cpuTimer = 0.0; 5268563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 527caccb7e3SMatthew G Knepley STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 528*19816777SMark cudaStreamSynchronize(NULL); 529*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 530df3898eeSBarry Smith //get the total elapsed time in ms 5318563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 532*19816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3; 533caccb7e3SMatthew G Knepley } 534caccb7e3SMatthew G Knepley 535*19816777SMark if (1) { /* verify kernels */ 536403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c; 537403adfb6SMatthew G Knepley bool errorSTREAMkernel = true; 538403adfb6SMatthew G Knepley 539403adfb6SMatthew G Knepley if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) { 540403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 541403adfb6SMatthew G Knepley exit(1); 542403adfb6SMatthew G Knepley } 543403adfb6SMatthew G Knepley if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) { 544403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 545403adfb6SMatthew G Knepley exit(1); 546403adfb6SMatthew G Knepley } 547403adfb6SMatthew G Knepley 548403adfb6SMatthew G Knepley if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) { 549403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 550403adfb6SMatthew G Knepley exit(1); 551403adfb6SMatthew G Knepley } 552403adfb6SMatthew G Knepley 553403adfb6SMatthew G Knepley /* 554403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 555403adfb6SMatthew G Knepley * device kernel output 556403adfb6SMatthew G Knepley */ 557403adfb6SMatthew G Knepley 558403adfb6SMatthew G Knepley /* Initialize memory on the device */ 559403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 560403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 561403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 562403adfb6SMatthew G Knepley 563403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 564403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 565403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 566403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 567403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 568403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 569403adfb6SMatthew G Knepley exit(-2000); 570403adfb6SMatthew G Knepley } 571403adfb6SMatthew G Knepley 572403adfb6SMatthew G Knepley /* Initialize memory on the device */ 573403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 574403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 575403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 576403adfb6SMatthew G Knepley 577403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 578403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 579403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 580403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 581403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 582403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 583403adfb6SMatthew G Knepley exit(-3000); 584403adfb6SMatthew G Knepley } 585403adfb6SMatthew G Knepley 586403adfb6SMatthew G Knepley /* Initialize memory on the device */ 587*19816777SMark set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 588403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 589403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 590403adfb6SMatthew G Knepley 591403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 592403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 593403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 594403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N); 595403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 596403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 597403adfb6SMatthew G Knepley exit(-4000); 598403adfb6SMatthew G Knepley } 599403adfb6SMatthew G Knepley 600403adfb6SMatthew G Knepley /* Initialize memory on the device */ 601403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 602403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 603403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 604403adfb6SMatthew G Knepley 605403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 606403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 607403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 608403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 609403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N); 610403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 611403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 612403adfb6SMatthew G Knepley exit(-5000); 613403adfb6SMatthew G Knepley } 614403adfb6SMatthew G Knepley 615403adfb6SMatthew G Knepley /* Initialize memory on the device */ 616403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 617403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 618403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 619403adfb6SMatthew G Knepley 620403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 621403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 622403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 623403adfb6SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 624403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N); 625403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 626403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 627403adfb6SMatthew G Knepley exit(-6000); 628403adfb6SMatthew G Knepley } 629403adfb6SMatthew G Knepley 630*19816777SMark free(h_a); 631*19816777SMark free(h_b); 632*19816777SMark free(h_c); 633*19816777SMark } 634403adfb6SMatthew G Knepley /* continue from here */ 635*19816777SMark printResultsReadable(times, sizeof(float)); 636403adfb6SMatthew G Knepley 637403adfb6SMatthew G Knepley /* Free memory on device */ 638403adfb6SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 639403adfb6SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 640403adfb6SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 641*19816777SMark 642403adfb6SMatthew G Knepley PetscFunctionReturn(0); 643403adfb6SMatthew G Knepley } 644403adfb6SMatthew G Knepley 645caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 646caccb7e3SMatthew G Knepley { 647caccb7e3SMatthew G Knepley double *d_a, *d_b, *d_c; 648caccb7e3SMatthew G Knepley int k; 649caccb7e3SMatthew G Knepley float times[8][NTIMES]; 650caccb7e3SMatthew G Knepley double scalar; 651caccb7e3SMatthew G Knepley PetscErrorCode ierr; 652caccb7e3SMatthew G Knepley 653caccb7e3SMatthew G Knepley PetscFunctionBegin; 654caccb7e3SMatthew G Knepley /* Allocate memory on device */ 655caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr); 656caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr); 657caccb7e3SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr); 658caccb7e3SMatthew G Knepley 659caccb7e3SMatthew G Knepley /* Compute execution configuration */ 660caccb7e3SMatthew G Knepley 661caccb7e3SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 662caccb7e3SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 663caccb7e3SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 664caccb7e3SMatthew G Knepley 665caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 666caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 667caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 668caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 669caccb7e3SMatthew G Knepley 670caccb7e3SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 671caccb7e3SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 672caccb7e3SMatthew G Knepley 673caccb7e3SMatthew G Knepley scalar=3.0; 674caccb7e3SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 6758563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 676caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 677*19816777SMark cudaStreamSynchronize(NULL); 678*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 679df3898eeSBarry Smith //get the total elapsed time in ms 680caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 6818563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 682*19816777SMark times[0][k] = cpuTimer*1.e3; 683caccb7e3SMatthew G Knepley } 684caccb7e3SMatthew G Knepley 685caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6868563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 687caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 688*19816777SMark cudaStreamSynchronize(NULL); 689*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 690df3898eeSBarry Smith //get the total elapsed time in ms 691caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 6928563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 693*19816777SMark times[1][k] = cpuTimer*1.e3; 694caccb7e3SMatthew G Knepley } 695caccb7e3SMatthew G Knepley 696caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6978563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 698caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 699*19816777SMark cudaStreamSynchronize(NULL); 700*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 701df3898eeSBarry Smith //get the total elapsed time in ms 7028563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 703*19816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3; 704caccb7e3SMatthew G Knepley 705caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7068563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 707caccb7e3SMatthew G Knepley STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 708*19816777SMark cudaStreamSynchronize(NULL); 709*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 710df3898eeSBarry Smith //get the total elapsed time in ms 7118563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 712*19816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3; 713caccb7e3SMatthew G Knepley 714caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7158563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 716caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 717*19816777SMark cudaStreamSynchronize(NULL); 718*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 719df3898eeSBarry Smith //get the total elapsed time in ms 7208563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 721*19816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3; 722caccb7e3SMatthew G Knepley 723caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7248563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 725caccb7e3SMatthew G Knepley STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 726*19816777SMark cudaStreamSynchronize(NULL); 727*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 728df3898eeSBarry Smith //get the total elapsed time in ms 7298563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 730*19816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3; 731caccb7e3SMatthew G Knepley 732caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7338563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 734caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 735*19816777SMark cudaStreamSynchronize(NULL); 736*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 737df3898eeSBarry Smith //get the total elapsed time in ms 7388563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 739*19816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3; 740caccb7e3SMatthew G Knepley 741caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7428563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 743caccb7e3SMatthew G Knepley STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 744*19816777SMark cudaStreamSynchronize(NULL); 745*19816777SMark ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); 746df3898eeSBarry Smith //get the total elapsed time in ms 7478563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 748*19816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3; 749caccb7e3SMatthew G Knepley } 750caccb7e3SMatthew G Knepley 751*19816777SMark if (1) { /* verify kernels */ 752caccb7e3SMatthew G Knepley double *h_a, *h_b, *h_c; 753caccb7e3SMatthew G Knepley bool errorSTREAMkernel = true; 754caccb7e3SMatthew G Knepley 755caccb7e3SMatthew G Knepley if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) { 756caccb7e3SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 757caccb7e3SMatthew G Knepley exit(1); 758caccb7e3SMatthew G Knepley } 759caccb7e3SMatthew G Knepley if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) { 760caccb7e3SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 761caccb7e3SMatthew G Knepley exit(1); 762caccb7e3SMatthew G Knepley } 763caccb7e3SMatthew G Knepley 764caccb7e3SMatthew G Knepley if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) { 765caccb7e3SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 766caccb7e3SMatthew G Knepley exit(1); 767caccb7e3SMatthew G Knepley } 768caccb7e3SMatthew G Knepley 769caccb7e3SMatthew G Knepley /* 770caccb7e3SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 771caccb7e3SMatthew G Knepley * device kernel output 772caccb7e3SMatthew G Knepley */ 773caccb7e3SMatthew G Knepley 774caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 775caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 776caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 777caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 778caccb7e3SMatthew G Knepley 779caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 780caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 781caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 782caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 783caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 784caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 785caccb7e3SMatthew G Knepley exit(-2000); 786caccb7e3SMatthew G Knepley } 787caccb7e3SMatthew G Knepley 788caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 789caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 790caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 791caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 792caccb7e3SMatthew G Knepley 793caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N); 794caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 795caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 796caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 797caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 798caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 799caccb7e3SMatthew G Knepley exit(-3000); 800caccb7e3SMatthew G Knepley } 801caccb7e3SMatthew G Knepley 802caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 803caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 804caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 805caccb7e3SMatthew G Knepley 806caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 807caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 808caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 809caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N); 810caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 811caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 812caccb7e3SMatthew G Knepley exit(-4000); 813caccb7e3SMatthew G Knepley } 814caccb7e3SMatthew G Knepley 815caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 816caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 817caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 818caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 819caccb7e3SMatthew G Knepley 820caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 821caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 822caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 823caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 824caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N); 825caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 826caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 827caccb7e3SMatthew G Knepley exit(-5000); 828caccb7e3SMatthew G Knepley } 829caccb7e3SMatthew G Knepley 830caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 831caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 832caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 833caccb7e3SMatthew G Knepley set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 834caccb7e3SMatthew G Knepley 835caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 836caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 837caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 838caccb7e3SMatthew G Knepley ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 839caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N); 840caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 841caccb7e3SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 842caccb7e3SMatthew G Knepley exit(-6000); 843caccb7e3SMatthew G Knepley } 844caccb7e3SMatthew G Knepley 845*19816777SMark free(h_a); 846*19816777SMark free(h_b); 847*19816777SMark free(h_c); 848*19816777SMark } 849caccb7e3SMatthew G Knepley /* continue from here */ 850*19816777SMark printResultsReadable(times,sizeof(double)); 851caccb7e3SMatthew G Knepley 852caccb7e3SMatthew G Knepley /* Free memory on device */ 853caccb7e3SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 854caccb7e3SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 855caccb7e3SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 856*19816777SMark 857caccb7e3SMatthew G Knepley PetscFunctionReturn(0); 858caccb7e3SMatthew G Knepley } 859caccb7e3SMatthew G Knepley 860403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 861403adfb6SMatthew G Knepley //Print Results to Screen and File 862403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 863*19816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize) 864a6dfd86eSKarl Rupp { 865403adfb6SMatthew G Knepley PetscErrorCode ierr; 866403adfb6SMatthew G Knepley PetscInt j, k; 867caccb7e3SMatthew G Knepley float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 868caccb7e3SMatthew G Knepley float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 869caccb7e3SMatthew G Knepley float mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30}; 870*19816777SMark // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "}; 871*19816777SMark const float bytes_per_kernel[8] = { 872*19816777SMark 2. * bsize * N, 873*19816777SMark 2. * bsize * N, 874*19816777SMark 2. * bsize * N, 875*19816777SMark 2. * bsize * N, 876*19816777SMark 3. * bsize * N, 877*19816777SMark 3. * bsize * N, 878*19816777SMark 3. * bsize * N, 879*19816777SMark 3. * bsize * N 880403adfb6SMatthew G Knepley }; 881*19816777SMark double rate,irate; 882*19816777SMark int rank,size; 883403adfb6SMatthew G Knepley PetscFunctionBegin; 884*19816777SMark ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRQ(ierr); 885*19816777SMark ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRQ(ierr); 886403adfb6SMatthew G Knepley /* --- SUMMARY --- */ 887*19816777SMark for (k = 0; k < NTIMES; ++k) { 888caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 889*19816777SMark avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec 890403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k])); 891403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k])); 892403adfb6SMatthew G Knepley } 893*19816777SMark } 894caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 895403adfb6SMatthew G Knepley avgtime[j] = avgtime[j]/(float)(NTIMES-1); 896403adfb6SMatthew G Knepley } 897*19816777SMark j = 7; 898*19816777SMark irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j]; 899*19816777SMark ierr = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 900*19816777SMark if (!rank) { 901*19816777SMark FILE *fd; 902*19816777SMark if (size == 1) { 903*19816777SMark printf("%d %11.4f Rate (MB/s)\n",size, rate); 904*19816777SMark fd = fopen("flops","w"); 905*19816777SMark fprintf(fd,"%g\n",rate); 906*19816777SMark fclose(fd); 907*19816777SMark } else { 908*19816777SMark double prate; 909*19816777SMark fd = fopen("flops","r"); 910*19816777SMark fscanf(fd,"%lg",&prate); 911*19816777SMark fclose(fd); 912*19816777SMark printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate); 913*19816777SMark } 914*19816777SMark } 915*19816777SMark 916403adfb6SMatthew G Knepley PetscFunctionReturn(0); 917403adfb6SMatthew G Knepley } 918