#include <petscsys.h>
#include <petsctime.h>

extern int BlastCache(void);
extern int test1(void);
extern int test2(void);

int main(int argc, char **argv)
{
  PetscCall(PetscInitialize(&argc, &argv, 0, 0));
  PetscCall(test1());
  PetscCall(test2());
  PetscCall(PetscFinalize());
  return 0;
}

int test1(void)
{
  PetscLogDouble t1, t2;
  double         value;
  int            i, ierr, *z, *zi, intval;
  PetscScalar   *x, *y;
  PetscRandom    r;

  PetscCall(PetscRandomCreate(PETSC_COMM_SELF, &r));
  PetscCall(PetscRandomSetFromOptions(r));
  PetscCall(PetscMalloc1(20000, &x));
  PetscCall(PetscMalloc1(20000, &y));

  PetscCall(PetscMalloc1(2000, &z));
  PetscCall(PetscMalloc1(2000, &zi));

  /* Take care of paging effects */
  PetscCall(PetscTime(&t1));

  /* Form the random set of integers */
  for (i = 0; i < 2000; i++) {
    PetscCall(PetscRandomGetValue(r, &value));
    intval = (int)(value * 20000.0);
    z[i]   = intval;
  }

  for (i = 0; i < 2000; i++) {
    PetscCall(PetscRandomGetValue(r, &value));
    intval = (int)(value * 20000.0);
    zi[i]  = intval;
  }
  /* fprintf(stdout,"Done setup\n"); */

  PetscCall(BlastCache());

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) x[i] = y[i];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[i] = y[i]", (t2 - t1) / 2000.0);

  PetscCall(BlastCache());

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 500; i += 4) {
    x[i]     = y[z[i]];
    x[1 + i] = y[z[1 + i]];
    x[2 + i] = y[z[2 + i]];
    x[3 + i] = y[z[3 + i]];
  }
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[i] = y[idx[i]] - unroll 4", (t2 - t1) / 2000.0);

  PetscCall(BlastCache());

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) x[i] = y[z[i]];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[i] = y[idx[i]]", (t2 - t1) / 2000.0);

  PetscCall(BlastCache());

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 1000; i += 2) {
    x[i]     = y[z[i]];
    x[1 + i] = y[z[1 + i]];
  }
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[i] = y[idx[i]] - unroll 2", (t2 - t1) / 2000.0);

  PetscCall(BlastCache());

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) x[z[i]] = y[i];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[z[i]] = y[i]", (t2 - t1) / 2000.0);

  PetscCall(BlastCache());

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) x[z[i]] = y[zi[i]];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[z[i]] = y[zi[i]]", (t2 - t1) / 2000.0);

  PetscCall(PetscArraycpy(x, y, 10));
  PetscCall(PetscArraycpy(z, zi, 10));
  PetscCall(PetscFree(z));
  PetscCall(PetscFree(zi));
  PetscCall(PetscFree(x));
  PetscCall(PetscFree(y));
  PetscCall(PetscRandomDestroy(&r));
  PetscFunctionReturn(PETSC_SUCCESS);
}

int test2(void)
{
  PetscLogDouble t1, t2;
  double         value;
  int            i, ierr, z[20000], zi[20000], intval, tmp;
  PetscScalar    x[20000], y[20000];
  PetscRandom    r;

  PetscCall(PetscRandomCreate(PETSC_COMM_SELF, &r));
  PetscCall(PetscRandomSetFromOptions(r));

  /* Take care of paging effects */
  PetscCall(PetscTime(&t1));

  for (i = 0; i < 20000; i++) {
    x[i]  = i;
    y[i]  = i;
    z[i]  = i;
    zi[i] = i;
  }

  /* Form the random set of integers */
  for (i = 0; i < 20000; i++) {
    PetscCall(PetscRandomGetValue(r, &value));
    intval    = (int)(value * 20000.0);
    tmp       = z[i];
    z[i]      = z[intval];
    z[intval] = tmp;
  }

  for (i = 0; i < 20000; i++) {
    PetscCall(PetscRandomGetValue(r, &value));
    intval     = (int)(value * 20000.0);
    tmp        = zi[i];
    zi[i]      = zi[intval];
    zi[intval] = tmp;
  }
  /* fprintf(stdout,"Done setup\n"); */

  /* PetscCall(BlastCache()); */

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) x[i] = y[i];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[i] = y[i]", (t2 - t1) / 2000.0);

  /* PetscCall(BlastCache()); */

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) y[i] = x[z[i]];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[i] = y[idx[i]]", (t2 - t1) / 2000.0);

  /* PetscCall(BlastCache()); */

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) x[z[i]] = y[i];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[z[i]] = y[i]", (t2 - t1) / 2000.0);

  /* PetscCall(BlastCache()); */

  PetscCall(PetscTime(&t1));
  for (i = 0; i < 2000; i++) y[z[i]] = x[zi[i]];
  PetscCall(PetscTime(&t2));
  fprintf(stdout, "%-27s : %e sec\n", "x[z[i]] = y[zi[i]]", (t2 - t1) / 2000.0);

  PetscCall(PetscRandomDestroy(&r));
  PetscFunctionReturn(PETSC_SUCCESS);
}

int BlastCache(void)
{
  int          i, ierr, n = 1000000;
  PetscScalar *x, *y, *z, *a, *b;

  PetscCall(PetscMalloc1(5 * n, &x));
  y = x + n;
  z = y + n;
  a = z + n;
  b = a + n;

  for (i = 0; i < n; i++) {
    a[i] = (PetscScalar)i;
    y[i] = (PetscScalar)i;
    z[i] = (PetscScalar)i;
    b[i] = (PetscScalar)i;
    x[i] = (PetscScalar)i;
  }

  for (i = 0; i < n; i++) a[i] = 3.0 * x[i] + 2.0 * y[i] + 3.3 * z[i] - 25. * b[i];
  for (i = 0; i < n; i++) b[i] = 3.0 * x[i] + 2.0 * y[i] + 3.3 * a[i] - 25. * b[i];
  for (i = 0; i < n; i++) z[i] = 3.0 * x[i] + 2.0 * y[i] + 3.3 * a[i] - 25. * b[i];
  PetscCall(PetscFree(x));
  PetscFunctionReturn(PETSC_SUCCESS);
}
