jit-source/sycl/sycl-shared-basis-tensor-templates.h

*bd882c8aSJames Wright// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
*bd882c8aSJames Wright// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
*bd882c8aSJames Wright//
*bd882c8aSJames Wright// SPDX-License-Identifier: BSD-2-Clause
*bd882c8aSJames Wright//
*bd882c8aSJames Wright// This file is part of CEED:  http://github.com/ceed
*bd882c8aSJames Wright
*bd882c8aSJames Wright/// @file
*bd882c8aSJames Wright/// Internal header for SYCL shared memory tensor product basis templates
*bd882c8aSJames Wright#ifndef _ceed_sycl_shared_basis_tensor_templates_h
*bd882c8aSJames Wright#define _ceed_sycl_shared_basis_tensor_templates_h
*bd882c8aSJames Wright
*bd882c8aSJames Wright#include <ceed.h>
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D tensor contraction x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                        private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *V = 0.0;
*bd882c8aSJames Wright  if (item_id_x < Q_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright      *V += B[i + item_id_x * P_1D] * scratch[i];  // Contract x direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D transpose tensor contraction x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *V = 0.0;
*bd882c8aSJames Wright  if (item_id_x < P_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright      *V += B[item_id_x + i * P_1D] * scratch[i];  // Contract x direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D interpolate to quadrature points
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D interpolate transpose
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                              local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D derivatives at quadrature points
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                   local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D derivatives transpose
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                            local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 1D quadrature weights
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  *w                      = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D tensor contraction x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                        private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x + item_id_y * T_1D] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *V = 0.0;
*bd882c8aSJames Wright  if (item_id_x < Q_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright      *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D tensor contract y
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                        private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x + item_id_y * T_1D] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *V = 0.0;
*bd882c8aSJames Wright  if (item_id_x < Q_1D && item_id_y < Q_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright      *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D transpose tensor contract y
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x + item_id_y * T_1D] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *V = 0.0;
*bd882c8aSJames Wright  if (item_id_x < Q_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright      *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D transpose tensor contract x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x + item_id_y * T_1D] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *V = 0.0;
*bd882c8aSJames Wright  if (item_id_x < P_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright      *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D transpose tensor contract and add x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                    private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  scratch[item_id_x + item_id_y * T_1D] = *U;
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  if (item_id_x < P_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright    for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright      *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D interpolate to quadrature points
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                           local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  CeedScalar r_t[1];
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
*bd882c8aSJames Wright    ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D interpolate transpose
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                                    local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  CeedScalar r_t[1];
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
*bd882c8aSJames Wright    ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D derivatives at quadrature points
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                         local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
*bd882c8aSJames Wright                         local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  CeedScalar r_t[1];
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
*bd882c8aSJames Wright    ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
*bd882c8aSJames Wright    ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
*bd882c8aSJames Wright    ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D derivatives transpose
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
*bd882c8aSJames Wright                                  local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
*bd882c8aSJames Wright                                  local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  CeedScalar r_t[1];
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
*bd882c8aSJames Wright    ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
*bd882c8aSJames Wright    ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
*bd882c8aSJames Wright    ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
*bd882c8aSJames Wright    ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 2D quadrature weights
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D tensor contract x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                        private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedScalar r_B[T_1D];
*bd882c8aSJames Wright  for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright    r_B[i] = B[i + item_id_x * P_1D];
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    scratch[item_id_x + item_id_y * T_1D] = U[k];
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    V[k] = 0.0;
*bd882c8aSJames Wright    if (item_id_x < Q_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright        V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D tensor contract y
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                        private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedScalar r_B[T_1D];
*bd882c8aSJames Wright  for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright    r_B[i] = B[i + item_id_y * P_1D];
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    scratch[item_id_x + item_id_y * T_1D] = U[k];
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    V[k] = 0.0;
*bd882c8aSJames Wright    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright        V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D tensor contract z
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                        private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < Q_1D; k++) {
*bd882c8aSJames Wright    V[k] = 0.0;
*bd882c8aSJames Wright    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < P_1D; i++) {
*bd882c8aSJames Wright        V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D transpose tensor contract z
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    V[k] = 0.0;
*bd882c8aSJames Wright    if (item_id_x < Q_1D && item_id_y < Q_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright        V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D transpose tensor contract y
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedScalar r_B[T_1D];
*bd882c8aSJames Wright  for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright    r_B[i] = B[item_id_y + i * P_1D];
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    scratch[item_id_x + item_id_y * T_1D] = U[k];
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    V[k] = 0.0;
*bd882c8aSJames Wright    if (item_id_x < Q_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright        V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D transpose tensor contract y
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                    private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedScalar r_B[T_1D];
*bd882c8aSJames Wright  for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright    r_B[i] = B[item_id_y + i * P_1D];
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    scratch[item_id_x + item_id_y * T_1D] = U[k];
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright    if (item_id_x < Q_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright        V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D transpose tensor contract x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedScalar r_B[T_1D];
*bd882c8aSJames Wright  for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright    r_B[i] = B[item_id_x + i * P_1D];
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    scratch[item_id_x + item_id_y * T_1D] = U[k];
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright    V[k] = 0.0;
*bd882c8aSJames Wright    if (item_id_x < P_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright        V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// 3D transpose tensor contract add x
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightinline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
*bd882c8aSJames Wright                                    private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
*bd882c8aSJames Wright  const CeedInt item_id_x = get_local_id(0);
*bd882c8aSJames Wright  const CeedInt item_id_y = get_local_id(1);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedScalar r_B[T_1D];
*bd882c8aSJames Wright  for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright    r_B[i] = B[item_id_x + i * P_1D];
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  for (CeedInt k = 0; k < P_1D; k++) {
*bd882c8aSJames Wright    scratch[item_id_x + item_id_y * T_1D] = U[k];
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    if (item_id_x < P_1D && item_id_y < P_1D) {
*bd882c8aSJames Wright      for (CeedInt i = 0; i < Q_1D; i++) {
*bd882c8aSJames Wright        V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright    work_group_barrier(CLK_LOCAL_MEM_FENCE);
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//
// For each of the NUM_COMP components, runs ContractX3d -> ContractY3d ->
// ContractZ3d, all three with s_B. Component c is read starting at
// r_U[c * P_1D] and its result is written starting at r_V[c * Q_1D].
//------------------------------------------------------------------------------
inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
                           local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
  // Intermediates passed from one directional call to the next
  CeedScalar r_after_x[T_1D];
  CeedScalar r_after_y[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    const CeedInt in_off  = c * P_1D;
    const CeedInt out_off = c * Q_1D;

    ContractX3d(P_1D, Q_1D, r_U + in_off, s_B, r_after_x, scratch);
    ContractY3d(P_1D, Q_1D, r_after_x, s_B, r_after_y, scratch);
    ContractZ3d(P_1D, Q_1D, r_after_y, s_B, r_V + out_off, scratch);
  }
}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D interpolate transpose
//
// For each of the NUM_COMP components, runs ContractTransposeZ3d ->
// ContractTransposeY3d -> ContractTransposeX3d, all three with s_B.
// Component c is read starting at r_U[c * Q_1D] and its result is written
// starting at r_V[c * P_1D].
//------------------------------------------------------------------------------
inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
                                    local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
  // Intermediates passed from one directional call to the next
  CeedScalar r_after_z[T_1D];
  CeedScalar r_after_y[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    const CeedInt in_off  = c * Q_1D;
    const CeedInt out_off = c * P_1D;

    ContractTransposeZ3d(P_1D, Q_1D, r_U + in_off, s_B, r_after_z, scratch);
    ContractTransposeY3d(P_1D, Q_1D, r_after_z, s_B, r_after_y, scratch);
    ContractTransposeX3d(P_1D, Q_1D, r_after_y, s_B, r_V + out_off, scratch);
  }
}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//
// For each component, runs the ContractX3d -> ContractY3d -> ContractZ3d chain
// three times. In pass d exactly one of the three calls uses s_G instead of
// s_B (X in pass 0, Y in pass 1, Z in pass 2). Pass d of component c writes
// starting at r_V[c * Q_1D + d * NUM_COMP * Q_1D]; the input of component c
// is read starting at r_U[c * P_1D].
//------------------------------------------------------------------------------
inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
                         local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
                         local CeedScalar *restrict scratch) {
  CeedScalar r_after_x[T_1D];
  CeedScalar r_after_y[T_1D];

  // Distance in r_V between the outputs of two consecutive passes
  const CeedInt pass_stride = NUM_COMP * Q_1D;

  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    const CeedInt in_off  = c * P_1D;
    const CeedInt out_off = c * Q_1D;

    // Pass 0: s_G in the X call
    ContractX3d(P_1D, Q_1D, r_U + in_off, s_G, r_after_x, scratch);
    ContractY3d(P_1D, Q_1D, r_after_x, s_B, r_after_y, scratch);
    ContractZ3d(P_1D, Q_1D, r_after_y, s_B, r_V + out_off, scratch);

    // Pass 1: s_G in the Y call
    ContractX3d(P_1D, Q_1D, r_U + in_off, s_B, r_after_x, scratch);
    ContractY3d(P_1D, Q_1D, r_after_x, s_G, r_after_y, scratch);
    ContractZ3d(P_1D, Q_1D, r_after_y, s_B, r_V + out_off + pass_stride, scratch);

    // Pass 2: s_G in the Z call
    ContractX3d(P_1D, Q_1D, r_U + in_off, s_B, r_after_x, scratch);
    ContractY3d(P_1D, Q_1D, r_after_x, s_B, r_after_y, scratch);
    ContractZ3d(P_1D, Q_1D, r_after_y, s_G, r_V + out_off + 2 * pass_stride, scratch);
  }
}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D derivatives transpose
//
// For each component, runs a transposed Z -> Y -> X chain three times, once
// per input slot d, read starting at r_U[c * Q_1D + d * NUM_COMP * Q_1D].
// In pass d exactly one call uses s_G instead of s_B (X in pass 0, Y in
// pass 1, Z in pass 2). All three passes target r_V[c * P_1D]: pass 0 ends
// with ContractTransposeX3d, passes 1 and 2 end with ContractTransposeAddX3d,
// which accumulates into its output.
//------------------------------------------------------------------------------
inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
                                  local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
                                  local CeedScalar *restrict scratch) {
  CeedScalar r_after_z[T_1D];
  CeedScalar r_after_y[T_1D];

  // Distance in r_U between the inputs of two consecutive passes
  const CeedInt pass_stride = NUM_COMP * Q_1D;

  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    const CeedInt in_off  = c * Q_1D;
    const CeedInt out_off = c * P_1D;

    // Pass 0: s_G in the X call, non-accumulating final step
    ContractTransposeZ3d(P_1D, Q_1D, r_U + in_off, s_B, r_after_z, scratch);
    ContractTransposeY3d(P_1D, Q_1D, r_after_z, s_B, r_after_y, scratch);
    ContractTransposeX3d(P_1D, Q_1D, r_after_y, s_G, r_V + out_off, scratch);

    // Pass 1: s_G in the Y call, accumulated onto pass 0
    ContractTransposeZ3d(P_1D, Q_1D, r_U + in_off + pass_stride, s_B, r_after_z, scratch);
    ContractTransposeY3d(P_1D, Q_1D, r_after_z, s_G, r_after_y, scratch);
    ContractTransposeAddX3d(P_1D, Q_1D, r_after_y, s_B, r_V + out_off, scratch);

    // Pass 2: s_G in the Z call, accumulated onto passes 0 and 1
    ContractTransposeZ3d(P_1D, Q_1D, r_U + in_off + 2 * pass_stride, s_G, r_after_z, scratch);
    ContractTransposeY3d(P_1D, Q_1D, r_after_z, s_B, r_after_y, scratch);
    ContractTransposeAddX3d(P_1D, Q_1D, r_after_y, s_B, r_V + out_off, scratch);
  }
}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D collocated derivatives at quadrature points
//
// For each component, first runs the same X -> Y -> Z chain with s_B and
// (P_1D, Q_1D) that InterpTensor3d uses, leaving the result in a private
// buffer. That buffer is then fed to ContractX3d, ContractY3d and ContractZ3d
// individually, each with s_G and (Q_1D, Q_1D); call d writes starting at
// r_V[c * Q_1D + d * NUM_COMP * Q_1D].
//------------------------------------------------------------------------------
inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
                                   local CeedScalar *restrict scratch) {
  CeedScalar r_buf_a[T_1D];
  CeedScalar r_buf_b[T_1D];

  // Distance in r_V between the outputs of two consecutive s_G calls
  const CeedInt pass_stride = NUM_COMP * Q_1D;

  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    const CeedInt in_off  = c * P_1D;
    const CeedInt out_off = c * Q_1D;

    // s_B chain; the Z call writes back into r_buf_a
    ContractX3d(P_1D, Q_1D, r_U + in_off, s_B, r_buf_a, scratch);
    ContractY3d(P_1D, Q_1D, r_buf_a, s_B, r_buf_b, scratch);
    ContractZ3d(P_1D, Q_1D, r_buf_b, s_B, r_buf_a, scratch);

    // One s_G call per direction, all reading r_buf_a
    ContractX3d(Q_1D, Q_1D, r_buf_a, s_G, r_V + out_off, scratch);
    ContractY3d(Q_1D, Q_1D, r_buf_a, s_G, r_V + out_off + pass_stride, scratch);
    ContractZ3d(Q_1D, Q_1D, r_buf_a, s_G, r_V + out_off + 2 * pass_stride, scratch);
  }
}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//
// For each component, the three input slots (read starting at
// r_U[c * Q_1D + d * NUM_COMP * Q_1D]) are combined into one private buffer
// with s_G and (Q_1D, Q_1D): slot 2 through ContractTransposeZ3d, then slot 1
// through ContractTransposeAddY3d and slot 0 through ContractTransposeAddX3d.
// That buffer then goes through the transposed Z -> Y -> X chain with s_B and
// (P_1D, Q_1D), writing starting at r_V[c * P_1D].
//------------------------------------------------------------------------------
inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
                                            local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
                                            private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
  CeedScalar r_buf_a[T_1D];
  CeedScalar r_buf_b[T_1D];

  // Distance in r_U between two consecutive input slots
  const CeedInt slot_stride = NUM_COMP * Q_1D;

  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    const CeedInt in_off  = c * Q_1D;
    const CeedInt out_off = c * P_1D;

    // s_G stage: slot 2 first, then slots 1 and 0 through the "Add" variants
    ContractTransposeZ3d(Q_1D, Q_1D, r_U + in_off + 2 * slot_stride, s_G, r_buf_b, scratch);
    ContractTransposeAddY3d(Q_1D, Q_1D, r_U + in_off + slot_stride, s_G, r_buf_b, scratch);
    ContractTransposeAddX3d(Q_1D, Q_1D, r_U + in_off, s_G, r_buf_b, scratch);

    // s_B stage on the combined buffer
    ContractTransposeZ3d(P_1D, Q_1D, r_buf_b, s_B, r_buf_a, scratch);
    ContractTransposeY3d(P_1D, Q_1D, r_buf_a, s_B, r_buf_b, scratch);
    ContractTransposeX3d(P_1D, Q_1D, r_buf_b, s_B, r_V + out_off, scratch);
  }
}
*bd882c8aSJames Wright
//------------------------------------------------------------------------------
// 3D quadrature weights
//
// A work-item whose local ids (x, y) are both below Q_1D writes
//   w[q] = (q_weight_1d[x] * q_weight_1d[y]) * q_weight_1d[q],  q in [0, Q_1D).
// Every other work-item writes 0.0 to w[0 .. Q_1D - 1].
//------------------------------------------------------------------------------
inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
  const CeedInt ix = get_local_id(0);
  const CeedInt iy = get_local_id(1);

  // Work-items outside the Q_1D x Q_1D footprint only clear their output
  if (ix >= Q_1D || iy >= Q_1D) {
    for (CeedInt z = 0; z < Q_1D; z++) {
      w[z] = 0.0;
    }
    return;
  }

  // x-y product is formed once and reused for every z
  const CeedScalar plane_weight = q_weight_1d[ix] * q_weight_1d[iy];

  for (CeedInt z = 0; z < Q_1D; z++) {
    w[z] = plane_weight * q_weight_1d[z];
  }
}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright
*bd882c8aSJames Wright#endif