benchmarks/streams/CUDAVersion.cu

/*
  STREAM benchmark implementation in CUDA.

    COPY:       a(i) = b(i)
    SCALE:      a(i) = q*b(i)
    SUM:        a(i) = b(i) + c(i)
    TRIAD:      a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation runs in double precision by default; the single-precision
  kernels are selected instead with the option -double false.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
/* Banner text passed to PetscInitialize() as the program's help message */
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n"
                     " Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley#include <petscconf.h>
403adfb6SMatthew G Knepley#include <petscsys.h>
403adfb6SMatthew G Knepley#include <petsctime.h>
5f80ce2aSJacob Faibussowitsch#include <petscdevice.h>
403adfb6SMatthew G Knepley
/* Number of elements in each benchmark array */
#define N 10000000
/* Number of timed repetitions of every kernel */
#define NTIMES 10

/* Two-argument minimum/maximum; only defined when no earlier header supplied them */
#ifndef MIN
  #define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
  #define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

/* Scale factors the host verification routines divide relative errors by */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;
403adfb6SMatthew G Knepley
/*
  set_array - Kernel that stores value into a[0..len-1] (single precision).

  Every thread starts at its global thread index and then advances by the total
  number of launched threads (blockDim.x * gridDim.x) until it reaches len.
*/
__global__ void set_array(float *a, float value, size_t len)
{
  /* Widen before multiplying: the built-in index variables are 32-bit, so the
     products could otherwise wrap (a stride wrapping to 0 would never terminate). */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) a[i] = value;
}
403adfb6SMatthew G Knepley
/*
  set_array_double - Kernel that stores value into a[0..len-1] (double precision).

  Every thread starts at its global thread index and then advances by the total
  number of launched threads (blockDim.x * gridDim.x) until it reaches len.
*/
__global__ void set_array_double(double *a, double value, size_t len)
{
  /* Widen before multiplying: the built-in index variables are 32-bit, so the
     products could otherwise wrap (a stride wrapping to 0 would never terminate). */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) a[i] = value;
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Copy - Kernel computing b[i] = a[i] for i in [0, len) (single precision).

  Note the direction: a is the source and b the destination. Every thread starts
  at its global thread index and advances by the total number of launched threads.
*/
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) b[i] = a[i];
}
403adfb6SMatthew G Knepley
/*
  STREAM_Copy_double - Kernel computing b[i] = a[i] for i in [0, len) (double precision).

  Note the direction: a is the source and b the destination. Every thread starts
  at its global thread index and advances by the total number of launched threads.
*/
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) b[i] = a[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Copy_Optimized - Kernel computing b[i] = a[i] with one element per thread
  (single precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) b[i] = a[i];
}
403adfb6SMatthew G Knepley
/*
  STREAM_Copy_Optimized_double - Kernel computing b[i] = a[i] with one element per
  thread (double precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) b[i] = a[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Scale - Kernel computing b[i] = scale * a[i] for i in [0, len) (single precision).

  Every thread starts at its global thread index and advances by the total
  number of launched threads until it reaches len.
*/
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) b[i] = scale * a[i];
}
403adfb6SMatthew G Knepley
/*
  STREAM_Scale_double - Kernel computing b[i] = scale * a[i] for i in [0, len)
  (double precision).

  Every thread starts at its global thread index and advances by the total
  number of launched threads until it reaches len.
*/
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) b[i] = scale * a[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Scale_Optimized - Kernel computing b[i] = scale * a[i] with one element
  per thread (single precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) b[i] = scale * a[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Scale_Optimized_double - Kernel computing b[i] = scale * a[i] with one
  element per thread (double precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) b[i] = scale * a[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Add - Kernel computing c[i] = a[i] + b[i] for i in [0, len) (single precision).

  Every thread starts at its global thread index and advances by the total
  number of launched threads until it reaches len.
*/
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) c[i] = a[i] + b[i];
}
403adfb6SMatthew G Knepley
/*
  STREAM_Add_double - Kernel computing c[i] = a[i] + b[i] for i in [0, len)
  (double precision).

  Every thread starts at its global thread index and advances by the total
  number of launched threads until it reaches len.
*/
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) c[i] = a[i] + b[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Add_Optimized - Kernel computing c[i] = a[i] + b[i] with one element per
  thread (single precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) c[i] = a[i] + b[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Add_Optimized_double - Kernel computing c[i] = a[i] + b[i] with one
  element per thread (double precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) c[i] = a[i] + b[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Triad - Kernel computing c[i] = a[i] + scalar * b[i] for i in [0, len)
  (single precision).

  Every thread starts at its global thread index and advances by the total
  number of launched threads until it reaches len.
*/
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
403adfb6SMatthew G Knepley
/*
  STREAM_Triad_double - Kernel computing c[i] = a[i] + scalar * b[i] for i in
  [0, len) (double precision).

  Every thread starts at its global thread index and advances by the total
  number of launched threads until it reaches len.
*/
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* 64-bit index arithmetic: the 32-bit built-in products could wrap for huge launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Triad_Optimized - Kernel computing c[i] = a[i] + scalar * b[i] with one
  element per thread (single precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) c[i] = a[i] + scalar * b[i];
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Triad_Optimized_double - Kernel computing c[i] = a[i] + scalar * b[i]
  with one element per thread (double precision).

  The launch must provide at least len threads in total; if it does not, the
  kernel writes nothing at all. Threads whose index is >= len are idle.
*/
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* Both products are formed in size_t so they cannot wrap in 32-bit arithmetic;
     a wrapped thread count would make the guard below reject a valid launch. */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;
  const size_t i        = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (nThreads >= len && i < len) c[i] = a[i] + scalar * b[i];
}
caccb7e3SMatthew G Knepley
403adfb6SMatthew G Knepley/* Host side verification routines */
/*
  STREAM_Copy_verify - Host check of a copy result: b[i] is compared against a[i].

  Returns true as soon as one entry's relative error, divided by flt_eps,
  exceeds 2; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float want   = a[i];
    const float delta  = b[i] - want;
    const float relErr = (fabsf(delta) / fabsf(want)) / flt_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 2.f) return true;
  }
  return false;
}
403adfb6SMatthew G Knepley
/*
  STREAM_Copy_verify_double - Host check of a double-precision copy result:
  b[i] is compared against a[i].

  Returns true as soon as one entry's relative error, divided by dbl_eps,
  exceeds 2; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const double want  = a[i];
    const double delta = b[i] - want;
    /* fabs, not fabsf: the float variant would round both operands to single
       precision and hide differences below float resolution */
    const double relErr = (fabs(delta) / fabs(want)) / dbl_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 2.) return true;
  }
  return false;
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Scale_verify - Host check of a scale result: b[i] is compared against
  scale * a[i].

  Returns true as soon as one entry's relative error, divided by flt_eps,
  exceeds 2; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float want   = scale * a[i];
    const float delta  = b[i] - want;
    const float relErr = (fabsf(delta) / fabsf(want)) / flt_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 2.f) return true;
  }
  return false;
}
403adfb6SMatthew G Knepley
/*
  STREAM_Scale_verify_double - Host check of a double-precision scale result:
  b[i] is compared against scale * a[i].

  Returns true as soon as one entry's relative error, divided by dbl_eps,
  exceeds 2; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const double want  = scale * a[i];
    const double delta = b[i] - want;
    /* Double-precision fabs and dbl_eps, matching STREAM_Copy_verify_double:
       fabsf/flt_eps would judge double data with a single-precision tolerance */
    const double relErr = (fabs(delta) / fabs(want)) / dbl_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 2.) return true;
  }
  return false;
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Add_verify - Host check of an add result: c[i] is compared against
  a[i] + b[i].

  Returns true as soon as one entry's relative error, divided by flt_eps,
  exceeds 2; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float want   = a[i] + b[i];
    const float delta  = c[i] - want;
    const float relErr = (fabsf(delta) / fabsf(want)) / flt_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 2.f) return true;
  }
  return false;
}
403adfb6SMatthew G Knepley
/*
  STREAM_Add_verify_double - Host check of a double-precision add result:
  c[i] is compared against a[i] + b[i].

  Returns true as soon as one entry's relative error, divided by dbl_eps,
  exceeds 2; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const double want  = a[i] + b[i];
    const double delta = c[i] - want;
    /* Double-precision fabs and dbl_eps, matching STREAM_Copy_verify_double:
       fabsf/flt_eps would judge double data with a single-precision tolerance */
    const double relErr = (fabs(delta) / fabs(want)) / dbl_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 2.) return true;
  }
  return false;
}
caccb7e3SMatthew G Knepley
/*
  STREAM_Triad_verify - Host check of a triad result: c[i] is compared against
  a[i] + scalar * b[i].

  Returns true as soon as one entry's relative error, divided by flt_eps,
  exceeds 3; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float want   = a[i] + scalar * b[i];
    const float delta  = c[i] - want;
    const float relErr = (fabsf(delta) / fabsf(want)) / flt_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 3.f) return true;
  }
  return false;
}
403adfb6SMatthew G Knepley
/*
  STREAM_Triad_verify_double - Host check of a double-precision triad result:
  c[i] is compared against a[i] + scalar * b[i].

  Returns true as soon as one entry's relative error, divided by dbl_eps,
  exceeds 3; returns false when every entry is within that bound (note: true
  means an error was detected).
*/
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const double want  = a[i] + scalar * b[i];
    const double delta = c[i] - want;
    /* Double-precision fabs and dbl_eps, matching STREAM_Copy_verify_double:
       fabsf/flt_eps would judge double data with a single-precision tolerance */
    const double relErr = (fabs(delta) / fabs(want)) / dbl_eps;

    /* stop at the first element that is out of tolerance */
    if (relErr > 3.) return true;
  }
  return false;
}
caccb7e3SMatthew G Knepley
403adfb6SMatthew G Knepley/* forward declarations */
/* Checks the requested device, then runs the single- or double-precision benchmark */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming);
/* Single-precision benchmark driver */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
/* Double-precision benchmark driver */
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
/* Result reporting; defined outside this section (meaning of the size_t argument not visible here) */
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
403adfb6SMatthew G Knepley
/*
  main - Reads the -device and -double options and runs the benchmark through
  setupStream().

  A result line is printed only when setupStream() reports a nonzero code; the
  process exit status is 0 in either case.
*/
int main(int argc, char *argv[])
{
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscBool       runDouble = PETSC_TRUE;
  PetscInt        device    = 0;
  PetscErrorCode  ierr;

  /* Device flags are set before PETSc is initialized */
  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr != 0) {
    /* only reached on failure, so the verdict is always "FAILED" here */
    const char *verdict = ierr ? "FAILED" : "PASSES";

    PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", verdict));
  }
  PetscCall(PetscFinalize());
  return 0;
}
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////////
403adfb6SMatthew G Knepley//Run the appropriate tests
403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////////
/*
  setupStream - Selects the CUDA device and runs the requested benchmark.

  Input Parameters:
+ deviceNum - index of the CUDA device to use; an out-of-range value falls back to device 0
. runDouble - PETSC_TRUE runs runStreamDouble(), PETSC_FALSE runs runStream()
- cpuTiming - forwarded to the run routine as its bDontUseGPUTiming argument

  Returns 0 on success, -1000 when no CUDA device is available, -1 when the
  device properties cannot be read or double precision is requested on a device
  with compute capability below 1.3, or the error code of a failing PETSc/CUDA call.
*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  int            deviceCount         = 0;
  cudaDeviceProp deviceProp;

  PetscFunctionBegin;
  /* A failing query is treated like "no devices" rather than reading an unset count */
  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) deviceCount = 0;
  if (deviceCount == 0) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
    /* leave through PetscFunctionReturn so the PetscFunctionBegin above is balanced */
    PetscFunctionReturn((PetscErrorCode)-1000);
  }

  if (deviceNum >= deviceCount || deviceNum < 0) {
    /* PetscInt may be 64-bit, so it must not be printed with %d */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %" PetscInt_FMT " given hence default gpu %d will be used !!!!!\n", deviceNum, 0));
    deviceNum = 0;
  }

  /* deviceNum is now known to lie in [0, deviceCount), so the narrowing cast is safe */
  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* the format consumes one integer: pass the device index it refers to */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %" PetscInt_FMT " properties, exiting\n", deviceNum));
    PetscFunctionReturn((PetscErrorCode)-1);
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    PetscFunctionReturn((PetscErrorCode)-1);
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  } else {
    PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  }
  PetscFunctionReturn(0);
}
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
403adfb6SMatthew G Knepley// runStream
403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
403adfb6SMatthew G KnepleyPetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
403adfb6SMatthew G Knepley{
403adfb6SMatthew G Knepley  float          *d_a, *d_b, *d_c;
403adfb6SMatthew G Knepley  int            k;
caccb7e3SMatthew G Knepley  float          times[8][NTIMES];
403adfb6SMatthew G Knepley  float          scalar;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  PetscFunctionBegin;
403adfb6SMatthew G Knepley  /* Allocate memory on device */
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Compute execution configuration */
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
403adfb6SMatthew G Knepley  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
403adfb6SMatthew G Knepley  if (N % dimBlock.x != 0) dimGrid.x+=1;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Initialize memory on the device */
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
403adfb6SMatthew G Knepley  PetscLogDouble cpuTimer = 0.0;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  scalar=3.0f;
403adfb6SMatthew G Knepley  for (k = 0; k < NTIMES; ++k) {
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
403adfb6SMatthew G Knepley    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
403adfb6SMatthew G Knepley    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
403adfb6SMatthew G Knepley    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
caccb7e3SMatthew G Knepley    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
9566063dSJacob Faibussowitsch    // PetscCallCUDA(cudaEventRecord(start, 0));
caccb7e3SMatthew G Knepley    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
9566063dSJacob Faibussowitsch    PetscCallCUDA(cudaEventRecord(stop, 0));
9566063dSJacob Faibussowitsch    // PetscCallCUDA(cudaEventSynchronize(stop));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
6f2b61bcSKarl Rupp    else {
9566063dSJacob Faibussowitsch      // PetscCallCUDA(cudaEventElapsedTime(&times[4][k], start, stop));
403adfb6SMatthew G Knepley    }
403adfb6SMatthew G Knepley
caccb7e3SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
caccb7e3SMatthew G Knepley    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;
caccb7e3SMatthew G Knepley
caccb7e3SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
caccb7e3SMatthew G Knepley    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;
caccb7e3SMatthew G Knepley
caccb7e3SMatthew G Knepley    cpuTimer = 0.0;
8563dfccSBarry Smith    PetscTimeSubtract(&cpuTimer);
caccb7e3SMatthew G Knepley    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
19816777SMark    cudaStreamSynchronize(NULL);
9566063dSJacob Faibussowitsch    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
df3898eeSBarry Smith    //get the total elapsed time in ms
8563dfccSBarry Smith    PetscTimeAdd(&cpuTimer);
19816777SMark    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
caccb7e3SMatthew G Knepley  }
caccb7e3SMatthew G Knepley
19816777SMark  if (1) { /* verify kernels */
403adfb6SMatthew G Knepley  float *h_a, *h_b, *h_c;
403adfb6SMatthew G Knepley  bool  errorSTREAMkernel = true;
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
403adfb6SMatthew G Knepley    printf("Unable to allocate array h_a, exiting ...\n");
403adfb6SMatthew G Knepley    exit(1);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
403adfb6SMatthew G Knepley    printf("Unable to allocate array h_b, exiting ...\n");
403adfb6SMatthew G Knepley    exit(1);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
403adfb6SMatthew G Knepley    printf("Unalbe to allocate array h_c, exiting ...\n");
403adfb6SMatthew G Knepley    exit(1);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /*
403adfb6SMatthew G Knepley   * perform kernel, copy device memory into host memory and verify each
403adfb6SMatthew G Knepley   * device kernel output
403adfb6SMatthew G Knepley   */
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Initialize memory on the device */
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
403adfb6SMatthew G Knepley  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
403adfb6SMatthew G Knepley  if (errorSTREAMkernel) {
9566063dSJacob Faibussowitsch    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
403adfb6SMatthew G Knepley    exit(-2000);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Initialize memory on the device */
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
403adfb6SMatthew G Knepley  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
403adfb6SMatthew G Knepley  if (errorSTREAMkernel) {
9566063dSJacob Faibussowitsch    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
403adfb6SMatthew G Knepley    exit(-3000);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Initialize memory on the device */
19816777SMark  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
403adfb6SMatthew G Knepley  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
403adfb6SMatthew G Knepley  if (errorSTREAMkernel) {
9566063dSJacob Faibussowitsch    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
403adfb6SMatthew G Knepley    exit(-4000);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Initialize memory on the device */
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
403adfb6SMatthew G Knepley  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
403adfb6SMatthew G Knepley  if (errorSTREAMkernel) {
9566063dSJacob Faibussowitsch    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
403adfb6SMatthew G Knepley    exit(-5000);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Initialize memory on the device */
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
403adfb6SMatthew G Knepley  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
403adfb6SMatthew G Knepley  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
403adfb6SMatthew G Knepley  if (errorSTREAMkernel) {
9566063dSJacob Faibussowitsch    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
403adfb6SMatthew G Knepley    exit(-6000);
403adfb6SMatthew G Knepley  }
403adfb6SMatthew G Knepley
19816777SMark  free(h_a);
19816777SMark  free(h_b);
19816777SMark  free(h_c);
19816777SMark  }
403adfb6SMatthew G Knepley  /* continue from here */
19816777SMark  printResultsReadable(times, sizeof(float));
403adfb6SMatthew G Knepley
403adfb6SMatthew G Knepley  /* Free memory on device */
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaFree(d_a));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaFree(d_b));
9566063dSJacob Faibussowitsch  PetscCallCUDA(cudaFree(d_c));
19816777SMark
403adfb6SMatthew G Knepley  PetscFunctionReturn(0);
403adfb6SMatthew G Knepley}
403adfb6SMatthew G Knepley
/*
  runStreamDouble - Times the eight double-precision STREAM kernels (Copy, Scale, Add, Triad
  and their "Optimized" variants) NTIMES times each, then checks the results of the Copy,
  Copy_Optimized, Scale, Add and Triad kernels on the host, and finally hands the timings to
  printResultsReadable().

  Input Parameters:
+ iNumThreadsPerBlock - number of threads per block used for every kernel launch; must be positive
- bDontUseGPUTiming   - retained for interface compatibility. Host wall-clock timing is the only
                        timer implemented in this routine, so every measurement is recorded
                        regardless of this flag (otherwise times[][] would be handed to
                        printResultsReadable() uninitialized when the flag is PETSC_FALSE).

  Notes:
  Each measurement covers the kernel launch, the synchronization of the default stream and an
  MPI_Barrier() on MPI_COMM_WORLD; the value stored in times[][] is in milliseconds.
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double         *d_a, *d_b, *d_c;
  double         *h_a, *h_b, *h_c;
  int            k;
  float          times[8][NTIMES];
  const double   scalar = 3.0;
  PetscLogDouble cpuTimer;
  bool           errorSTREAMkernel = true;

  PetscFunctionBegin;
  (void)bDontUseGPUTiming; /* see the note in the header comment */
  /* A non-positive block size would divide by zero in the grid computation below */
  if (iNumThreadsPerBlock < 1) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of threads per block must be positive");
  }

  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));

  /* Compute execution configuration: enough blocks to cover all N entries */
  dim3 dimBlock((unsigned int)iNumThreadsPerBlock);  /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid((N + dimBlock.x - 1)/dimBlock.x);     /* (ceil(N/dimBlock.x),1,1) */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
  PetscCallCUDA(cudaGetLastError());

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  for (k = 0; k < NTIMES; ++k) {
    /* The timer has to start from zero for every single measurement, including the first one
       of each iteration; otherwise the Copy time would also contain the previous measurement */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[0][k] = (float)(cpuTimer*1.e3); /* sec --> millisec */
    PetscCallCUDA(cudaGetLastError());    /* launch errors, checked outside the timed region */

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[1][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[2][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[3][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[4][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[5][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[6][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    times[7][k] = (float)(cpuTimer*1.e3);
    PetscCallCUDA(cudaGetLastError());
  }

  /* --- VERIFICATION --- host buffers receive the device arrays for checking */
  if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * For each kernel: reset the device arrays, run the kernel, copy the device
   * memory into host memory and verify the kernel output on the host.
   * The blocking cudaMemcpy() calls also wait for the preceding kernels.
   */

  /* Copy */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
  STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  PetscCallCUDA(cudaGetLastError());
  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
    exit(-2000);
  }

  /* Copy (optimized) */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
  STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  PetscCallCUDA(cudaGetLastError());
  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
    exit(-3000);
  }

  /* Scale (d_a is not touched by this kernel; only d_b and d_c are checked) */
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
  STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  PetscCallCUDA(cudaGetLastError());
  PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
  errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
    exit(-4000);
  }

  /* Add */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
  STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  PetscCallCUDA(cudaGetLastError());
  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
  errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
    exit(-5000);
  }

  /* Triad */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
  STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  PetscCallCUDA(cudaGetLastError());
  PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
  errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
    exit(-6000);
  }

  free(h_a);
  free(h_b);
  free(h_c);

  /* Free memory on device before reporting, so a reporting error cannot leak it */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscCall(printResultsReadable(times, sizeof(double)));

  PetscFunctionReturn(0);
}
caccb7e3SMatthew G Knepley
403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
403adfb6SMatthew G Knepley//Print Results to Screen and File
403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - Reports the aggregate bandwidth of the last timed kernel (row 7 of
  times[][], "Triad Opt." in the kernel ordering Copy, Copy Opt., Scale, Scale Opt., Add,
  Add Opt., Triad, Triad Opt.).

  Input Parameters:
+ times - times[j][k] is the time in milliseconds of kernel j in repetition k
- bsize - size in bytes of one array entry (e.g. sizeof(double))

  Notes:
  The per-rank rate is computed from the minimum time over all NTIMES repetitions and summed
  over MPI_COMM_WORLD onto rank 0, so this routine must be called by every rank.

  With a single rank, rank 0 prints the rate and stores it in the file "flops".  With several
  ranks, rank 0 reads the stored single-rank rate from "flops" and additionally prints the
  ratio of the current rate to it; if the file is missing or unusable only the rate is printed.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  const int    reported            = 7; /* kernel whose bandwidth is reported */
  /* Bytes moved per kernel: the first four entries count two arrays, the last four count three */
  const double bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double       mintime = 1e30;
  double       rate    = 0.0, irate;
  int          rank, size, k;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));

  /* --- SUMMARY --- best (smallest) time of the reported kernel */
  for (k = 0; k < NTIMES; ++k) {
    const double seconds = 1.e-03 * times[reported][k]; /* millisec --> sec */
    if (seconds < mintime) mintime = seconds;
  }
  /* Guard against a zero (or otherwise non-positive) time so that no division by zero occurs */
  irate = (mintime > 0.0) ? 1.0E-06 * bytes_per_kernel[reported]/mintime : 0.0;
  PetscCallMPI(MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));

  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f   Rate (MB/s)\n",size, rate);
      fd = fopen("flops","w");
      if (!fd) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_FILE_OPEN, "Unable to open file flops for writing");
      }
      fprintf(fd,"%g\n",rate);
      fclose(fd);
    } else {
      double prate = 0.0;
      int    nread = 0;
      fd = fopen("flops","r");
      if (fd) {
        nread = fscanf(fd,"%lg",&prate);
        fclose(fd);
      }
      if (nread == 1 && prate > 0.0) {
        printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
      } else {
        /* No usable single-rank baseline: report the rate without the ratio */
        printf("%d %11.4f   Rate (MB/s)\n", size, rate);
      }
    }
  }

  PetscFunctionReturn(0);
}