// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

#include <ceed/types.h>

/// A structure used to pass additional data to f_build_mass_diff
struct BuildContext {
  CeedInt dim, space_dim;
};

/// libCEED Q-function for building quadrature data for a mass + diffusion operator
CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
  struct BuildContext *build_data = (struct BuildContext *)ctx;
  // in[0] is Jacobians with shape [dim, nc=dim, Q]
  // in[1] is quadrature weights, size (Q)
  //
  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
  // the symmetric part of the result.
  const CeedScalar *J = in[0], *w = in[1];
  CeedScalar       *q_data = out[0];

  switch (build_data->dim + 10 * build_data->space_dim) {
    case 11:
      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
        // Mass
        q_data[i + Q * 0] = w[i] * J[i];
        // Diffusion
        q_data[i + Q * 1] = w[i] / J[i];
      }  // End of Quadrature Point Loop
      break;
    case 22:
      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
        // J: 0 2   q_data: 1 3   adj(J):  J22 -J12
        //    1 3           3 2           -J21  J11
        const CeedScalar J11 = J[i + Q * 0];
        const CeedScalar J21 = J[i + Q * 1];
        const CeedScalar J12 = J[i + Q * 2];
        const CeedScalar J22 = J[i + Q * 3];
        const CeedScalar qw  = w[i] / (J11 * J22 - J21 * J12);

        // Mass
        q_data[i + Q * 0] = w[i] * (J11 * J22 - J21 * J12);
        // Diffusion
        q_data[i + Q * 1] = qw * (J12 * J12 + J22 * J22);
        q_data[i + Q * 2] = qw * (J11 * J11 + J21 * J21);
        q_data[i + Q * 3] = -qw * (J11 * J12 + J21 * J22);
      }  // End of Quadrature Point Loop
      break;
    case 33:
      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
        // Compute the adjoint
        CeedScalar A[3][3];
        for (CeedInt j = 0; j < 3; j++) {
          for (CeedInt k = 0; k < 3; k++) {
            // Equivalent code with J as a VLA and no mod operations:
            // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
            A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] -
                      J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
          }
        }

        // Compute quadrature weight / det(J)
        const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);

        // Mass
        q_data[i + Q * 0] = w[i] * (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
        // Diffusion
        // Stored in Voigt convention
        // 1 6 5
        // 6 2 4
        // 5 4 3
        q_data[i + Q * 1] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
        q_data[i + Q * 2] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
        q_data[i + Q * 3] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
        q_data[i + Q * 4] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
        q_data[i + Q * 5] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
        q_data[i + Q * 6] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
      }  // End of Quadrature Point Loop
      break;
  }
  return CEED_ERROR_SUCCESS;
}

/// libCEED Q-function for applying a mass + diffusion operator
CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
  struct BuildContext *build_data = (struct BuildContext *)ctx;
  // in[0], out[0] have shape [1,   nc=1, Q]
  // in[1], out[1] have shape [dim, nc=1, Q]
  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
  CeedScalar       *v = out[0], *vg = out[1];

  switch (build_data->dim) {
    case 1:
      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
        // Mass
        v[i] = q_data[i + Q * 0] * u[i];
        // Diffusion
        vg[i] = ug[i] * q_data[i + Q * 1];
      }  // End of Quadrature Point Loop
      break;
    case 2:
      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
        // Mass
        v[i] = q_data[i + Q * 0] * u[i];

        // Diffusion
        // Read spatial derivatives of u
        const CeedScalar du[2] = {ug[i + Q * 0], ug[i + Q * 1]};

        // Read q_data (dXdxdXdx_T symmetric matrix)
        // Stored in Voigt convention
        // 1 3
        // 3 2
        const CeedScalar dXdxdXdx_T[2][2] = {
            {q_data[i + 1 * Q], q_data[i + 3 * Q]},
            {q_data[i + 3 * Q], q_data[i + 2 * Q]}
        };
        // j = direction of vg
        for (int j = 0; j < 2; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j]);
      }  // End of Quadrature Point Loop
      break;
    case 3:
      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
        // Mass
        v[i] = q_data[i + Q * 0] * u[i];

        // Diffusion
        // Read spatial derivatives of u
        const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};

        // Read q_data (dXdxdXdx_T symmetric matrix)
        // Stored in Voigt convention
        // 0 5 4
        // 5 1 3
        // 4 3 2
        const CeedScalar dXdxdXdx_T[3][3] = {
            {q_data[i + 1 * Q], q_data[i + 6 * Q], q_data[i + 5 * Q]},
            {q_data[i + 6 * Q], q_data[i + 2 * Q], q_data[i + 4 * Q]},
            {q_data[i + 5 * Q], q_data[i + 4 * Q], q_data[i + 3 * Q]}
        };
        // j = direction of vg
        for (int j = 0; j < 3; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
      }  // End of Quadrature Point Loop
      break;
  }
  return CEED_ERROR_SUCCESS;
}
