jit-source/sycl/sycl-gen-templates.h

*6ca0f394SUmesh Unnikrishnan// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
*6ca0f394SUmesh Unnikrishnan// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
*6ca0f394SUmesh Unnikrishnan//
*6ca0f394SUmesh Unnikrishnan// SPDX-License-Identifier: BSD-2-Clause
*6ca0f394SUmesh Unnikrishnan//
*6ca0f394SUmesh Unnikrishnan// This file is part of CEED:  http://github.com/ceed
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan/// @file
*6ca0f394SUmesh Unnikrishnan/// Internal header for SYCL backend macro and type definitions for JiT source
*6ca0f394SUmesh Unnikrishnan#ifndef _ceed_sycl_gen_templates_h
*6ca0f394SUmesh Unnikrishnan#define _ceed_sycl_gen_templates_h
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan#include <ceed/types.h>
*6ca0f394SUmesh Unnikrishnan
// Enable the two 64-bit atomics extensions. The OpenCL C specification ties the availability of
// atomic_double (used by the typedef below) to both of them being supported and enabled.
*6ca0f394SUmesh Unnikrishnan#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
*6ca0f394SUmesh Unnikrishnan#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
*6ca0f394SUmesh Unnikrishnan// TODO: Handle FP32 case (the typedef below hard-codes a double precision atomic)
// Atomic scalar type of the destination vector in the writeDofsOffset{1,2,3}d functions, which
// sum into it with atomic_fetch_add_explicit.
*6ca0f394SUmesh Unnikrishnantypedef atomic_double CeedAtomicScalar;
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// Load matrices for basis actions
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void loadMatrix(const CeedInt N, const CeedScalar* restrict d_B, CeedScalar* restrict B) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id    = get_local_linear_id();
*6ca0f394SUmesh Unnikrishnan  const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
*6ca0f394SUmesh Unnikrishnan  for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i];
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// 1D
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// L-vector -> E-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
*6ca0f394SUmesh Unnikrishnan                             const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = indices[node + elem * P_1D];
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp) {
*6ca0f394SUmesh Unnikrishnan      r_u[comp] = d_u[ind + strides_comp * comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// L-vector -> E-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
*6ca0f394SUmesh Unnikrishnan                              const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar* restrict d_u,
*6ca0f394SUmesh Unnikrishnan                              private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; comp++) {
*6ca0f394SUmesh Unnikrishnan      r_u[comp] = d_u[ind + comp * strides_comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> L-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
*6ca0f394SUmesh Unnikrishnan                              const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = indices[node + elem * P_1D];
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp)
*6ca0f394SUmesh Unnikrishnan      atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> L-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
*6ca0f394SUmesh Unnikrishnan                               const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar* restrict r_v,
*6ca0f394SUmesh Unnikrishnan                               global CeedScalar* restrict d_v) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; comp++) {
*6ca0f394SUmesh Unnikrishnan      d_v[ind + comp * strides_comp] = r_v[comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// 2D
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// L-vector -> E-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
*6ca0f394SUmesh Unnikrishnan                             const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x + item_id_y * P_1D;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// L-vector -> E-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
*6ca0f394SUmesh Unnikrishnan                              const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u,
*6ca0f394SUmesh Unnikrishnan                              private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x + item_id_y * P_1D;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> L-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
*6ca0f394SUmesh Unnikrishnan                              const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x + item_id_y * P_1D;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp)
*6ca0f394SUmesh Unnikrishnan      atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> L-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
*6ca0f394SUmesh Unnikrishnan                               const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v,
*6ca0f394SUmesh Unnikrishnan                               global CeedScalar* restrict d_v) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x + item_id_y * P_1D;
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp];
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// 3D
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// L-vector -> E-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
*6ca0f394SUmesh Unnikrishnan                             const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    for (CeedInt z = 0; z < P_1D; ++z) {
*6ca0f394SUmesh Unnikrishnan      const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
*6ca0f394SUmesh Unnikrishnan      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
*6ca0f394SUmesh Unnikrishnan      for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// L-vector -> E-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
*6ca0f394SUmesh Unnikrishnan                              const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u,
*6ca0f394SUmesh Unnikrishnan                              private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    for (CeedInt z = 0; z < P_1D; ++z) {
*6ca0f394SUmesh Unnikrishnan      const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
*6ca0f394SUmesh Unnikrishnan      const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan      for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> Q-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q,
*6ca0f394SUmesh Unnikrishnan                                   const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> Q-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem,
*6ca0f394SUmesh Unnikrishnan                                    const CeedInt num_elem, const CeedInt q, const global CeedScalar* restrict d_u,
*6ca0f394SUmesh Unnikrishnan                                    private CeedScalar* restrict r_u) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
*6ca0f394SUmesh Unnikrishnan    const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan    for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> L-vector, offsets provided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
*6ca0f394SUmesh Unnikrishnan                              const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    for (CeedInt z = 0; z < P_1D; ++z) {
*6ca0f394SUmesh Unnikrishnan      const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D;
*6ca0f394SUmesh Unnikrishnan      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
*6ca0f394SUmesh Unnikrishnan      for (CeedInt comp = 0; comp < num_comp; ++comp)
*6ca0f394SUmesh Unnikrishnan        atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device);
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// E-vector -> L-vector, strided
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
*6ca0f394SUmesh Unnikrishnan                               const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v,
*6ca0f394SUmesh Unnikrishnan                               global CeedScalar* restrict d_v) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan  const CeedInt elem      = get_global_id(2);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
*6ca0f394SUmesh Unnikrishnan    for (CeedInt z = 0; z < P_1D; ++z) {
*6ca0f394SUmesh Unnikrishnan      const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
*6ca0f394SUmesh Unnikrishnan      const CeedInt ind  = node * strides_node + elem * strides_elem;
*6ca0f394SUmesh Unnikrishnan      for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// 3D collocated derivatives computation
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U,
*6ca0f394SUmesh Unnikrishnan                        const local CeedScalar* s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  for (CeedInt comp = 0; comp < num_comp; ++comp) {
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      // X derivative
*6ca0f394SUmesh Unnikrishnan      r_V[comp + 0 * num_comp] = 0.0;
*6ca0f394SUmesh Unnikrishnan      for (CeedInt i = 0; i < Q_1D; ++i)
*6ca0f394SUmesh Unnikrishnan        r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction (X derivative)
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan      // Y derivative
*6ca0f394SUmesh Unnikrishnan      r_V[comp + 1 * num_comp] = 0.0;
*6ca0f394SUmesh Unnikrishnan      for (CeedInt i = 0; i < Q_1D; ++i)
*6ca0f394SUmesh Unnikrishnan        r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction (Y derivative)
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan      // Z derivative
*6ca0f394SUmesh Unnikrishnan      r_V[comp + 2 * num_comp] = 0.0;
*6ca0f394SUmesh Unnikrishnan      for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D];  // Contract z direction (Z derivative)
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnan// 3D collocated derivatives transpose
*6ca0f394SUmesh Unnikrishnan//------------------------------------------------------------------------------
*6ca0f394SUmesh Unnikrishnaninline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U,
*6ca0f394SUmesh Unnikrishnan                                 const local CeedScalar* restrict s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) {
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_x = get_local_id(0);
*6ca0f394SUmesh Unnikrishnan  const CeedInt item_id_y = get_local_id(1);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan  for (CeedInt comp = 0; comp < num_comp; ++comp) {
*6ca0f394SUmesh Unnikrishnan    // X derivative
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      for (CeedInt i = 0; i < Q_1D; ++i)
*6ca0f394SUmesh Unnikrishnan        r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction (X derivative)
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan    // Y derivative
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp];
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      for (CeedInt i = 0; i < Q_1D; ++i)
*6ca0f394SUmesh Unnikrishnan        r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction (Y derivative)
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan    // Z derivative
*6ca0f394SUmesh Unnikrishnan    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*6ca0f394SUmesh Unnikrishnan      for (CeedInt i = 0; i < Q_1D; ++i)
*6ca0f394SUmesh Unnikrishnan        r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp];  // PARTIAL contract z direction (Z derivative)
*6ca0f394SUmesh Unnikrishnan    }
*6ca0f394SUmesh Unnikrishnan  }
*6ca0f394SUmesh Unnikrishnan}
*6ca0f394SUmesh Unnikrishnan
*6ca0f394SUmesh Unnikrishnan#endif