jit-source/magma/magma-basis-weight-3d.h

// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

//////////////////////////////////////////////////////////////////////////////////////////
// weight basis action -- 3D
template <typename T, int DIM_, int NCOMP_, int Q_, int iDIM, int iCOMP>
__device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) {
  // Fills rV[iDIM][iCOMP][0 .. Q_-1] with products of three entries of sTweight:
  //   rV[iDIM][iCOMP][q] = sTweight[q] * sTweight[tx % Q_] * sTweight[tx / Q_]
  //
  // Expectations on the caller:
  //   - tx indexes a 1D thread configuration of size Q_^2; any thread with tx >= Q_^2 leaves rV untouched
  //   - rV[][][] follows the same storage convention as the other actions (interp, grad, ... etc)
  //   - iDIM and iCOMP select the slice of rV that receives this thread's Q_ output values
  //   - a sync after the call is recommended, so that sTweight may safely be overwritten

  // Threads outside the Q_ x Q_ footprint have nothing to compute
  if (tx >= (Q_ * Q_)) return;

  // The second and third factors depend only on tx, so fetch each of them once
  const T weight_tx_mod = sTweight[tx % Q_];
  const T weight_tx_div = sTweight[tx / Q_];

  for (int q = 0; q < Q_; ++q) {
    // Factors are multiplied left to right: (sTweight[q] * weight_tx_mod) * weight_tx_div
    rV[iDIM][iCOMP][q] = sTweight[q] * weight_tx_mod * weight_tx_div;
  }
}

//////////////////////////////////////////////////////////////////////////////////////////
// Writes the 3D quadrature weights of every element to dV.
//
// Thread layout used by the code below:
//   - threadIdx.x (tx): threads with tx < Q*Q each produce Q output values
//   - threadIdx.y (ty): selects the element handled within the block, elem_id = blockIdx.x * blockDim.y + ty
//
// Arguments:
//   dqweight1d  1D weights; entries [0, Q) are read
//   dV          output; element e writes dV[e * v_stride + j * Q*Q + tx] for j in [0, Q), tx in [0, Q*Q)
//   v_stride    distance (in entries) between consecutive elements in dV
//   nelem       number of elements; threads whose elem_id >= nelem write nothing
//
// The first Q entries of the shared buffer declared through MAGMA_DEVICE_SHARED are used as scratch for the
// 1D weights (NOTE(review): the buffer is presumably sized by the launcher -- confirm it holds >= Q entries).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __global__
    void magma_weight_3d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)

  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int elem_id = (blockIdx.x * blockDim.y) + ty;

  // shared memory pointers
  CeedScalar *sTweight = (CeedScalar *)shared_data;

  // read dqweight_1d
  // Every thread of the block takes part in the staging and in the barrier, including the ty rows of a
  // partially filled last block, so that __syncthreads() is reached uniformly by the whole block.
  // All rows store identical values to the same sTweight entries.
  if (tx < Q) {
    sTweight[tx] = dqweight1d[tx];
  }
  __syncthreads();

  // No barrier follows, so threads without an element can leave now
  if (elem_id >= nelem) return;

  CeedScalar rV[1][1][Q];  // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator

  // global memory pointers
  // The offset is formed in 64-bit arithmetic: elem_id * v_stride can exceed the range of int
  dV += (long long)elem_id * v_stride;

  magma_weight_3d_device<CeedScalar, 1, 1, Q, 0, 0>(sTweight, rV, tx);

  // write V
  if (tx < (Q * Q)) {
    for (int j = 0; j < Q; j++) {
      dV[j * (Q * Q) + tx] = rV[0][0][j];
    }
  }
}