C Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors
C All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
C
C SPDX-License-Identifier: (BSD-2-Clause)
C
C This file is part of CEED:  http://github.com/ceed

C> @file
C> Mass and diffusion operators examples using Nek5000
C_TESTARGS(name="BP1") -c {ceed_resource} -e bp1 -n 1 -b 4 -test
C_TESTARGS(name="BP3") -c {ceed_resource} -e bp3 -n 1 -b 4 -test

C-----------------------------------------------------------------------
      subroutine masssetupf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
C     Set up mass operator
C     Input: u1,u2,u3,q             Output: v1,v2,ierr
      integer q,ierr
      real*8 ctx(1)
      real*8 u1(3*q)
      real*8 u2(9*q)
      real*8 u3(q)
      real*8 v1(q)
      real*8 v2(q)
      real*8 a11,a12,a13,a21,a22,a23,a31,a32,a33
      real*8 g11,g12,g13,g21,g22,g23,g31,g32,g33
      real*8 jacmq

C     Quadrature Point Loop
      do i=1,q
        a11=u2(i+q*0)
        a12=u2(i+q*3)
        a13=u2(i+q*6)

        a21=u2(i+q*1)
        a22=u2(i+q*4)
        a23=u2(i+q*7)

        a31=u2(i+q*2)
        a32=u2(i+q*5)
        a33=u2(i+q*8)

        g11 = (a22*a33-a23*a32)
        g12 = (a13*a32-a33*a12)
        g13 = (a12*a23-a22*a13)

        g21 = (a23*a31-a21*a33)
        g22 = (a11*a33-a31*a13)
        g23 = (a13*a21-a23*a11)

        g31 = (a21*a32-a22*a31)
        g32 = (a12*a31-a32*a11)
        g33 = (a11*a22-a21*a12)

        jacmq = a11*g11+a21*g12+a31*g13

C       Rho
        v1(i)=u3(i)*jacmq

C       RHS
        v2(i)=u3(i)*jacmq
     $             *dsqrt(u1(i+q*0)*u1(i+q*0)
     $                   +u1(i+q*1)*u1(i+q*1)
     $                   +u1(i+q*2)*u1(i+q*2))
      enddo

      ierr=0
      end
C-----------------------------------------------------------------------
      subroutine massf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
C     Apply mass operator
C     Input: u1,u2,q                Output: v1,ierr
      integer q,ierr
      real*8 ctx(1)
      real*8 u1(q)
      real*8 u2(q)
      real*8 v1(q)

C     Quadrature Point Loop
      do i=1,q
        v1(i)=u2(i)*u1(i)
      enddo

      ierr=0
      end
C-----------------------------------------------------------------------
      subroutine diffsetupf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
C     Set up diffusion operator
C     Input: u1,u2,u3,q             Output: v1,v2,ierr
      integer q,ierr
      real*8 ctx(1)
      real*8 u1(3*q)
      real*8 u2(9*q)
      real*8 u3(q)
      real*8 v1(6*q)
      real*8 v2(q)
      real*8 a11,a12,a13,a21,a22,a23,a31,a32,a33
      real*8 g11,g12,g13,g21,g22,g23,g31,g32,g33
      real*8 jacmq,scl
      real*8 c(3),k(3)

C     Quadrature Point Loop
      do i=1,q
        pi = 3.14159265358979323846

        c(1)=0.
        c(2)=1.
        c(3)=2.
        k(1)=1.
        k(2)=2.
        k(3)=3.

        a11=u2(i+q*0)
        a12=u2(i+q*3)
        a13=u2(i+q*6)

        a21=u2(i+q*1)
        a22=u2(i+q*4)
        a23=u2(i+q*7)

        a31=u2(i+q*2)
        a32=u2(i+q*5)
        a33=u2(i+q*8)

        g11 = (a22*a33-a23*a32)
        g12 = (a13*a32-a33*a12)
        g13 = (a12*a23-a22*a13)

        g21 = (a23*a31-a21*a33)
        g22 = (a11*a33-a31*a13)
        g23 = (a13*a21-a23*a11)

        g31 = (a21*a32-a22*a31)
        g32 = (a12*a31-a32*a11)
        g33 = (a11*a22-a21*a12)

        jacmq = a11*g11+a21*g12+a31*g13

        scl = u3(i)/jacmq

C       Geometric factors
C       Stored in Voigt convention
C       0 5 4
C       5 1 3
C       4 3 2
        v1(i+0*q) = scl*(g11*g11+g12*g12+g13*g13) ! Grr
        v1(i+1*q) = scl*(g21*g21+g22*g22+g23*g23) ! Gss
        v1(i+2*q) = scl*(g31*g31+g32*g32+g33*g33) ! Gtt
        v1(i+3*q) = scl*(g21*g31+g22*g32+g23*g33) ! Gst
        v1(i+4*q) = scl*(g11*g31+g12*g32+g13*g33) ! Grt
        v1(i+5*q) = scl*(g11*g21+g12*g22+g13*g23) ! Grs

C       RHS
        v2(i) = u3(i)*jacmq*pi*pi
     $            *dsin(pi*(c(1)+k(1)*u1(i+0*q)))
     $            *dsin(pi*(c(2)+k(2)*u1(i+1*q)))
     $            *dsin(pi*(c(3)+k(3)*u1(i+2*q)))
     $            *(k(1)*k(1)+k(2)*k(2)+k(3)*k(3))

      enddo

      ierr=0
      end
C-----------------------------------------------------------------------
      subroutine diffusionf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
C     Apply diffusion operator
C     Input: u1,u2,q                Output: v1,ierr
      integer q,ierr
      real*8 ctx(1)
      real*8 u1(3*q)
      real*8 u2(6*q)
      real*8 v1(3*q)

C     Quadrature Point Loop
      do i=1,q
        v1(i+0*q)=
     $     u2(i+0*q)*u1(i)+u2(i+5*q)*u1(i+q)+u2(i+4*q)*u1(i+2*q)
        v1(i+1*q)=
     $     u2(i+5*q)*u1(i)+u2(i+1*q)*u1(i+q)+u2(i+3*q)*u1(i+2*q)
        v1(i+2*q)=
     $     u2(i+4*q)*u1(i)+u2(i+3*q)*u1(i+q)+u2(i+2*q)*u1(i+2*q)
      enddo

      ierr=0
      end
C-----------------------------------------------------------------------
      subroutine set_h2_as_rhoJac_GL(h2,bmq,nxq)
C     Set h2 as rhoJac
C     Input: bmq,nxq                Output: h2
      include 'SIZE'
      real*8 h2(1),bmq(1)

      common /ctmp77/ wd(lxd),zd(lxd)
      integer e,i,L

      call zwgl(zd,wd,nxq)  ! nxq = number of points

      q = 1.0               ! Later, this can be a function of position...

      L = 0
      do e=1,lelt
      do i=1,nxq**ldim
         L=L+1
         h2(L) = q*bmq(L)
      enddo
      enddo

      return
      end
C-----------------------------------------------------------------------
      subroutine dist_fld_h1(e)
C     Set distance initial condition for BP1
C     Input:                        Output: e
      include 'SIZE'
      include 'TOTAL'
      real*8 x,y,z
      real*8 e(1)

      n=lx1*ly1*lz1*nelt

      do i=1,n
        x=xm1(i,1,1,1)
        y=ym1(i,1,1,1)
        z=zm1(i,1,1,1)

        e(i) = dsqrt(x*x+y*y+z*z)
      enddo

      call dsavg(e)  ! This is requisite for random fields

      return
      end
C-----------------------------------------------------------------------
      subroutine sin_fld_h1(e)
C     Set sine initial condition for BP3
C     Input:                        Output: e
      include 'SIZE'
      include 'TOTAL'
      real*8 x,y,z
      real*8 e(1)
      real*8 c(3),k(3)

      n=lx1*ly1*lz1*nelt
      pi = 3.14159265358979323846

      c(1)=0.
      c(2)=1.
      c(3)=2.
      k(1)=1.
      k(2)=2.
      k(3)=3.

      do i=1,n
        x=xm1(i,1,1,1)
        y=ym1(i,1,1,1)
        z=zm1(i,1,1,1)

        e(i) = dsin(pi*(c(1)+k(1)*x))
     &        *dsin(pi*(c(2)+k(2)*y))
     &        *dsin(pi*(c(3)+k(3)*z))

      enddo

      call dsavg(e)  ! This is requisite for random fields

      return
      end
C-----------------------------------------------------------------------
      subroutine uservp(ix,iy,iz,eg) ! set variable properties
      include 'SIZE'
      include 'TOTAL'
      include 'NEKUSE'
      integer e,f,eg
C     e = gllel(eg)

      udiff  = 0.0
      utrans = 0.0

      return
      end
C-----------------------------------------------------------------------
      subroutine userf(ix,iy,iz,eg) ! set acceleration term
C
C     Note: this is an acceleration term, NOT a force!
C     Thus, ffx will subsequently be multiplied by rho(x,t).
C
      include 'SIZE'
      include 'TOTAL'
      include 'NEKUSE'
      integer e,f,eg
C     e = gllel(eg)

      ffx = 0.0
      ffy = 0.0
      ffz = 0.0

      return
      end
C-----------------------------------------------------------------------
      subroutine userq(i,j,k,eg) ! set source term
      include 'SIZE'
      include 'TOTAL'
      include 'NEKUSE'
      integer e,f,eg
      e = gllel(eg)

      qvol   = 0

      return
      end
C-----------------------------------------------------------------------
      subroutine userbc(ix,iy,iz,f,eg) ! set up boundary conditions
C     NOTE ::: This subroutine MAY NOT be called by every process
      include 'SIZE'
      include 'TOTAL'
      include 'NEKUSE'
      integer e,f,eg

      ux   = 0.0
      uy   = 0.0
      uz   = 0.0
      temp = 0.0

      return
      end
C-----------------------------------------------------------------------
      subroutine useric(ix,iy,iz,eg) ! set up initial conditions
      include 'SIZE'
      include 'TOTAL'
      include 'NEKUSE'
      integer e,f,eg

      ux   = 0.0
      uy   = 0.0
      uz   = 0.0
      temp = 0.0

      return
      end
C-----------------------------------------------------------------------
      subroutine usrdat   ! This routine to modify element vertices
      include 'SIZE'
      include 'TOTAL'

      return
      end
C-----------------------------------------------------------------------
      subroutine usrdat2  ! This routine to modify mesh coordinates
      include 'SIZE'
      include 'TOTAL'

      x0 = 0
      x1 = 1
      call rescale_x(xm1,x0,x1)
      call rescale_x(ym1,x0,x1)
      call rescale_x(zm1,x0,x1)

      param(59)=1  ! Force Nek to use the "deformed element" formulation

      return
      end
C-----------------------------------------------------------------------
      subroutine usrdat3
      include 'SIZE'
      include 'TOTAL'

      return
      end
C-----------------------------------------------------------------------
      subroutine xmask1   (r1,h2,nel)
C     Apply zero Dirichlet boundary conditions
C     Input: h2,nel                 Output: r1
      include 'SIZE'
      include 'TOTAL'
      real*8 r1(1),h2(1)

      n=nx1*ny1*nz1*nel
      do i=1,n
         r1(i)=r1(i)*h2(i)
      enddo

      return
      end
C-----------------------------------------------------------------------
      function glrdif(x,y,n)
C     Compute Linfty norm of (x-y)
C     Input: x,y                    Output: n
      real*8 x(n),y(n)

      dmx=0
      xmx=0
      ymx=0

      do i=1,n
         diff=abs(x(i)-y(i))
         dmx =max(dmx,diff)
         xmx =max(xmx,x(i))
         ymx =max(ymx,y(i))
      enddo

      xmx = max(xmx,ymx)
      dmx = glmax(dmx,1) ! max across processors
      xmx = glmax(xmx,1)

      if (xmx.gt.0) then
         glrdif = dmx/xmx
      else
         glrdif = -dmx   ! Negative indicates something strange
      endif

      return
      end
C-----------------------------------------------------------------------
      subroutine loc_grad3(ur,us,ut,u,N,D,Dt)
C     3D transpose of local gradient
C     Input: u,N,D,Dt               Output: ur,us,ut
      real*8 ur(0:N,0:N,0:N),us(0:N,0:N,0:N),ut(0:N,0:N,0:N)
      real*8 u (0:N,0:N,0:N)
      real*8 D (0:N,0:N),Dt(0:N,0:N)

      m1 = N+1
      m2 = m1*m1

      call mxm(D ,m1,u(0,0,0),m1,ur,m2)
      do k=0,N
         call mxm(u(0,0,k),m1,Dt,m1,us(0,0,k),m1)
      enddo
      call mxm(u(0,0,0),m2,Dt,m1,ut,m1)

      return
      end
c-----------------------------------------------------------------------
      subroutine loc_grad3t(u,ur,us,ut,N,D,Dt,w)
C     3D transpose of local gradient
C     Input: ur,us,ut,N,D,Dt        Output: u
       real*8 u (0:N,0:N,0:N)
       real*8 ur(0:N,0:N,0:N),us(0:N,0:N,0:N),ut(0:N,0:N,0:N)
       real*8 D (0:N,0:N),Dt(0:N,0:N)
       real*8 w (0:N,0:N,0:N)

       m1 = N+1
       m2 = m1*m1
       m3 = m1*m1*m1

       call mxm(Dt,m1,ur,m1,u(0,0,0),m2)
       do k=0,N
          call mxm(us(0,0,k),m1,D ,m1,w(0,0,k),m1)
       enddo
       call add2(u(0,0,0),w,m3)
       call mxm(ut,m2,D ,m1,w,m1)
       call add2(u(0,0,0),w,m3)

      return
      end
C-----------------------------------------------------------------------
      subroutine geodatq(gf,bmq,w3mq,nxq)
C     Routine to generate elemental geometric matrices on mesh 1
C     (Gauss-Legendre Lobatto mesh).
      include 'SIZE'
      include 'TOTAL'

      parameter (lg=3+3*(ldim-2),lzq=lx1+1,lxyd=lzq**ldim)

      real*8 gf(lg,nxq**ldim,lelt),bmq(nxq**ldim,lelt),w3mq(nxq,nxq,nxq)

      common /ctmp1/ xr(lxyd),xs(lxyd),xt(lxyd)
      common /sxrns/ yr(lxyd),ys(lxyd),yt(lxyd)
     $ ,             zr(lxyd),zs(lxyd),zt(lxyd)

      common /ctmp77/ wd(lxd),zd(lxd)
      common /dxmfine/ dxmq(lzq,lzq),dxtmq(lzq,lzq)

      integer e
      real*8 tmp(lxyd)
      real*8 a11,a12,a13,a21,a22,a23,a31,a32,a33
      real*8 g11,g12,g13,g21,g22,g23,g31,g32,g33
      real*8 jacmq

      if (nxq.gt.lzq) call exitti('ABORT: recompile with lzq=$',nxq)

      call zwgl    (zd,wd,lzq)                            ! nxq = number of points
      call gen_dgl (dxmq,dxtmq,lzq,lzq,tmp)

      do k=1,nxq
      do j=1,nxq
      do i=1,nxq
         w3mq(i,j,k) = wd(i)*wd(j)*wd(k)
      enddo
      enddo
      enddo

      nxyzq = nxq**ldim
      nxqm1 = lzq-1

      do e=1,nelt
         call intp_rstd (tmp,xm1(1,1,1,e),lx1,lzq,if3d,0) ! 0-->Fwd interpolation
         call loc_grad3 (xr,xs,xt,tmp,nxqm1,dxmq,dxtmq)

         call intp_rstd (tmp,ym1(1,1,1,e),lx1,lzq,if3d,0)
         call loc_grad3 (yr,ys,yt,tmp,nxqm1,dxmq,dxtmq)

         call intp_rstd (tmp,zm1(1,1,1,e),lx1,lzq,if3d,0)
         call loc_grad3 (zr,zs,zt,tmp,nxqm1,dxmq,dxtmq)

         do i=1,nxyzq
            a11 = xr(i)
            a12 = xs(i)
            a13 = xt(i)

            a21 = yr(i)
            a22 = ys(i)
            a23 = yt(i)

            a31 = zr(i)
            a32 = zs(i)
            a33 = zt(i)

            g11 = (a22*a33-a23*a32)
            g12 = (a13*a32-a33*a12)
            g13 = (a12*a23-a22*a13)

            g21 = (a23*a31-a21*a33)
            g22 = (a11*a33-a31*a13)
            g23 = (a13*a21-a23*a11)

            g31 = (a21*a32-a22*a31)
            g32 = (a12*a31-a32*a11)
            g33 = (a11*a22-a21*a12)

            jacmq = a11*g11+a21*g12+a31*g13

            bmq(i,e)  = w3mq(i,1,1)*jacmq
            scale     = w3mq(i,1,1)/jacmq

            gf(1,i,e) = scale*(g11*g11+g12*g12+g13*g13) ! Grr
            gf(2,i,e) = scale*(g11*g21+g12*g22+g13*g23) ! Grs
            gf(3,i,e) = scale*(g11*g31+g12*g32+g13*g33) ! Grt
            gf(4,i,e) = scale*(g21*g21+g22*g22+g23*g23) ! Gss
            gf(5,i,e) = scale*(g21*g31+g22*g32+g23*g33) ! Gst
            gf(6,i,e) = scale*(g31*g31+g32*g32+g33*g33) ! Gtt

         enddo
      enddo

      return
      end
C-----------------------------------------------------------------------
      subroutine setprecn_bp1 (d,h1,h2)
C     Generate diagonal preconditioner for Helmholtz operator
C     Input: h1,h2                  Output: d
      include 'SIZE'
      include 'TOTAL'

      parameter (lxyz=lx1*ly1*lz1,lg=3+3*(ldim-2))

      real*8    d(lx1,ly1,lz1,lelt),h1(lxyz,lelt),h2(lxyz,lelt)
      integer e

      real*8         gf(lg,lx1,ly1,lz1,lelt) ! Equivalence new gf() data
      equivalence (gf,g1m1)                  ! layout to g1m1...g6m1

      real*8 ysm1(ly1)

      nel   = nelfld(ifield)
      n     = nel*lx1*ly1*lz1
      nxyz  = lx1*ly1*lz1

      call copy    (d,bm1,n)   ! Mass matrix preconditioning full mass matrix
      call dssum   (d,nx1,ny1,nz1)
      call invcol1 (d,n)
      return

      call dsset(lx1,ly1,lz1)

      do 1000 e=1,nel

        call rzero(d(1,1,1,e),nxyz)

        do 320 iz=1,lz1
         do 320 iy=1,ly1
         do 320 ix=1,lx1
         do 320 iq=1,lx1
           d(ix,iy,iz,e) = d(ix,iy,iz,e)
     $                   + gf(1,iq,iy,iz,e) * dxm1(iq,ix)**2
     $                   + gf(2,ix,iq,iz,e) * dxm1(iq,iy)**2
     $                   + gf(3,ix,iy,iq,e) * dxm1(iq,iz)**2
  320    continue
C
C        Add cross terms if element is deformed.
C
         if (lxyz.gt.0) then

           do i2=1,ly1,ly1-1
           do i1=1,lx1,lx1-1
              d(1,i1,i2,e) = d(1,i1,i2,e)
     $            + gf(4,1,i1,i2,e) * dxtm1(1,1)*dytm1(i1,i1)
     $            + gf(5,1,i1,i2,e) * dxtm1(1,1)*dztm1(i2,i2)
              d(lx1,i1,i2,e) = d(lx1,i1,i2,e)
     $            + gf(4,lx1,i1,i2,e) * dxtm1(lx1,lx1)*dytm1(i1,i1)
     $            + gf(5,lx1,i1,i2,e) * dxtm1(lx1,lx1)*dztm1(i2,i2)
              d(i1,1,i2,e) = d(i1,1,i2,e)
     $            + gf(4,i1,1,i2,e) * dytm1(1,1)*dxtm1(i1,i1)
     $            + gf(6,i1,1,i2,e) * dytm1(1,1)*dztm1(i2,i2)
              d(i1,ly1,i2,e) = d(i1,ly1,i2,e)
     $            + gf(4,i1,ly1,i2,e) * dytm1(ly1,ly1)*dxtm1(i1,i1)
     $            + gf(6,i1,ly1,i2,e) * dytm1(ly1,ly1)*dztm1(i2,i2)
              d(i1,i2,1,e) = d(i1,i2,1,e)
     $            + gf(5,i1,i2,1,e) * dztm1(1,1)*dxtm1(i1,i1)
     $            + gf(6,i1,i2,1,e) * dztm1(1,1)*dytm1(i2,i2)
              d(i1,i2,lz1,e) = d(i1,i2,lz1,e)
     $            + gf(5,i1,i2,lz1,e) * dztm1(lz1,lz1)*dxtm1(i1,i1)
     $            + gf(6,i1,i2,lz1,e) * dztm1(lz1,lz1)*dytm1(i2,i2)

           enddo
           enddo
         endif

        do i=1,lxyz
           d(i,1,1,e)=d(i,1,1,e)*h1(i,e)+h2(i,e)*bm1(i,1,1,e)
        enddo

 1000 continue ! element loop

C     If axisymmetric, add a diagonal term in the radial direction (ISD=2)

      if (ifaxis.and.(isd.eq.2)) then
         do 1200 e=1,nel
            if (ifrzer(e)) call mxm(ym1(1,1,1,e),lx1,datm1,ly1,ysm1,1)
            k=0
            do 1190 j=1,ly1
            do 1190 i=1,lx1
               k=k+1
               if (ym1(i,j,1,e).ne.0.) then
                  term1 = bm1(i,j,1,e)/ym1(i,j,1,e)**2
                  if (ifrzer(e)) then
                     term2 =  wxm1(i)*wam1(1)*dam1(1,j)
     $                       *jacm1(i,1,1,e)/ysm1(i)
                  else
                     term2 = 0.
                  endif
                  d(i,j,1,e) = d(i,j,1,e) + h1(k,e)*(term1+term2)
               endif
 1190       continue
 1200    continue

      endif
      call dssum   (d,nx1,ny1,nz1)
      call invcol1 (d,n)

      if (nio.eq.0) write(6,1) n,d(1,1,1,1),h1(1,1),h2(1,1),bm1(1,1,1,1)
   1  format(i9,1p4e12.4,' diag prec')

      return
      end
C-----------------------------------------------------------------------
      subroutine setprecn_bp3 (d,h1,h2)
C     Generate dummy diagonal preconditioner for Helmholtz operator
C     Input: h1,h2                  Output: d
      include 'SIZE'
      include 'TOTAL'

      parameter (n=lx1*ly1*lz1*lelt)
      real*8 d(n),h1(n),h2(n)

      call rone (d,n)

      return
      end
C-----------------------------------------------------------------------
      subroutine userchk
      include 'SIZE'
      include 'TOTAL'

      integer bp

      call get_bp(bp)

      if (bp==1) then
        if (istep.gt.0) call bp1
      elseif (bp==3) then
        if (istep.gt.0) call bp3
      else
        write(6,*) "INVALID BP SPECIFICED"
      endif

      return
      end
C-----------------------------------------------------------------------
      subroutine bp1
C     Solution to BP1 using libCEED
      include 'SIZE'
      include 'TOTAL'
      include 'CTIMER'  ! ifsync
      include 'FDMH1'
      include 'ceed/fortran.h'

      parameter (lzq=lx1+1)
      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2),lq=lzq**ldim)
      common /bpgfactors/ gf(lg*lq,lelt),bmq(lq,lelt),w3mq(lq)

      parameter (lt=lx1*ly1*lz1*lelt)
      parameter (ld=lxd*lyd*lzd*lelt)
      common /vcrns/ u1(lt),r1(lt),r2(lt),r3(lt)
      common /vcrny/ e1(lt)
      common /vcrvh/ h1(lt),h2(lx*lelt),pap(3)
      real*8 coords(ldim*lx*lelt)

      logical ifh3
      integer*8 nnode
      integer ceed,err,test
      character*64 spec

      integer p,q,ncompx,ncompu,enode,lnode
      integer vec_p1,vec_ap1,vec_qdata,vec_coords,vec_rhs
      integer stridesu(3),stridesx(3),stridesw(3)
      integer erstrctu,erstrctx,erstrctw
      integer basisu,basisx
      integer qf_mass,qf_setup
      integer op_mass,op_setup
      real*8  x,y,z
      integer*8 offset

      external massf,masssetupf

      ifield = 1
      nxq    = nx1+1
      n = nx1*ny1*nz1*nelt

      ifsync = .false.

C     Set up coordinates
      ii=0
      do j=0,nelt-1
      do i=1,lx
        ii=ii+1
        x = xm1(ii,1,1,1)
        y = ym1(ii,1,1,1)
        z = zm1(ii,1,1,1)
        coords(i+0*lx+3*j*lx)=x
        coords(i+1*lx+3*j*lx)=y
        coords(i+2*lx+3*j*lx)=z
      enddo
      enddo

C     Init ceed library
      call get_spec(spec)
      call ceedinit(trim(spec)//char(0),ceed,err)

      call get_test(test)

C     Set up Nek geometry data
      call geodatq       (gf,bmq,w3mq,nxq)         ! Set up gf() arrays
      call set_h2_as_rhoJac_GL(h2,bmq,nxq)

C     Set up true soln
      call dist_fld_h1   (e1)
      call copy          (h1,e1,n)                 ! Save exact soln in h1

C     Set up solver parameters
      tol       = 1e-10
      param(22) = tol
      maxit     = 100

      call nekgsync()

C     Create ceed basis for mesh and computation
      p=nx1
      q=p+1
      ncompu=1
      ncompx=ldim
      call ceedbasiscreatetensorh1lagrange(ceed,ndim,ndim,p,q,
     $  ceed_gauss,basisx,err)
      call ceedbasiscreatetensorh1lagrange(ceed,ndim,ncompu,p,q,
     $  ceed_gauss,basisu,err)

C     Create ceed element restrictions for mesh and computation
      enode=p**ldim
      lnode=enode*nelt*ncompu
      stridesx=[1,enode,enode*ldim]
      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,ldim,
     $  ldim*lnode,stridesx,erstrctx,err)
      stridesu=[1,enode,enode*ncompu]
      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,ncompu,
     $  ncompu*lnode,stridesu,erstrctu,err)
      call ceedelemrestrictioncreatestrided(ceed,nelt,q**ldim,
     $  1,nelt*q**ldim,ceed_strides_backend,erstrctw,err)

C     Create ceed vectors
      call ceedvectorcreate(ceed,lnode,vec_p1,err)
      call ceedvectorcreate(ceed,lnode,vec_ap1,err)
      call ceedvectorcreate(ceed,lnode,vec_rhs,err)
      call ceedvectorcreate(ceed,ldim*lx*nelt,vec_coords,err)
      call ceedvectorcreate(ceed,nelt*q**ldim,vec_qdata,err)

      offset=0
      call ceedvectorsetarray(vec_coords,ceed_mem_host,
     $  ceed_use_pointer,coords,offset,err)

C     Create ceed qfunctions for masssetupf and massf
      call ceedqfunctioncreateinterior(ceed,1,masssetupf,
     $  EXAMPLE_DIR
     $  //'bps/bps.h:masssetupf',qf_setup,err)
      call ceedqfunctionaddinput(qf_setup,'x',ncompx,
     $  ceed_eval_interp,err)
      call ceedqfunctionaddinput(qf_setup,'dx',ncompx*ldim,
     $  ceed_eval_grad,err)
      call ceedqfunctionaddinput(qf_setup,'weight',ncompu,
     $  ceed_eval_weight,err)
      call ceedqfunctionaddoutput(qf_setup,'qdata',ncompu,
     $  ceed_eval_none,err)
      call ceedqfunctionaddoutput(qf_setup,'rhs',ncompu,
     $  ceed_eval_interp,err)

      call ceedqfunctioncreateinterior(ceed,1,massf,
     $  EXAMPLE_DIR
     $  //'bps/bps.h:massf',qf_mass,err)
      call ceedqfunctionaddinput(qf_mass,'u',ncompu,
     $  ceed_eval_interp,err)
      call ceedqfunctionaddinput(qf_mass,'qdata',ncompu,
     $  ceed_eval_none,err)
      call ceedqfunctionaddoutput(qf_mass,'v',ncompu,
     $  ceed_eval_interp,err)

C     Create ceed operators
      call ceedoperatorcreate(ceed,qf_setup,
     $  ceed_qfunction_none,ceed_qfunction_none,op_setup,err)
      call ceedoperatorsetfield(op_setup,'x',erstrctx,
     $  basisx,ceed_vector_active,err)
      call ceedoperatorsetfield(op_setup,'dx',erstrctx,
     $  basisx,ceed_vector_active,err)
      call ceedoperatorsetfield(op_setup,'weight',
     $  ceed_elemrestriction_none,basisx,ceed_vector_none,err)
      call ceedoperatorsetfield(op_setup,'qdata',erstrctw,
     $  ceed_basis_collocated,ceed_vector_active,err)
      call ceedoperatorsetfield(op_setup,'rhs',erstrctu,
     $  basisu,vec_rhs,err)

      call ceedoperatorcreate(ceed,qf_mass,
     $  ceed_qfunction_none,ceed_qfunction_none,op_mass,err)
      call ceedoperatorsetfield(op_mass,'u',erstrctu,
     $  basisu,ceed_vector_active,err)
      call ceedoperatorsetfield(op_mass,'qdata',erstrctw,
     $  ceed_basis_collocated,vec_qdata,err)
      call ceedoperatorsetfield(op_mass,'v',erstrctu,
     $  basisu,ceed_vector_active,err)

C     Compute setup data
      call ceedvectorsetarray(vec_rhs,ceed_mem_host,
     $  ceed_use_pointer,r1,offset,err)
      call ceedoperatorapply(op_setup,vec_coords,vec_qdata,
     $  ceed_request_immediate,err)

C     Set up true RHS
      call dssum         (r1,nx1,ny1,nz1)          ! r1

C     Set up algebraic RHS with libCEED
      call ceedvectorsetarray(vec_p1,ceed_mem_host,
     $  ceed_use_pointer,h1,offset,err)
      call ceedvectorsetarray(vec_ap1,ceed_mem_host,
     $  ceed_use_pointer,r2,offset,err)
      call ceedoperatorapply(op_mass,vec_p1,vec_ap1,
     $  ceed_request_immediate,err)                ! r2 = A_ceed*h1
      call dssum         (r2,nx1,ny1,nz1)

C     Set up algebraic RHS with Nek5000
      call axhm1         (pap,r3,h1,h1,h2,'bp1')   ! r3 = A_nek5k*h1
      call dssum         (r3,nx1,ny1,nz1)

      call nekgsync()

C     Solve true RHS
      if (nid.eq.0) write (6,*) "libCEED true RHS"
      tstart = dnekclock()
      call cggos(u1,r1,h1,h2,vmult,binvm1,tol,ceed,op_mass,
     $  vec_p1,vec_ap1,maxit,'bp1')
      tstop  = dnekclock()

C     Output
      telaps = (tstop-tstart)
      maxits = maxit
      er1 = glrdif(u1,e1,n)
      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit

      if (test.eq.1.and.nid.eq.0) then
        if (maxit>=100) then
          write(6,*) "UNCONVERGED CG"
        endif
        if (dabs(er1)>5e-3) then
          write(6,*) "ERROR IS TOO LARGE"
        endif
      endif

      nx     = nx1-1
      nnode  = nelgt            ! nnodes
      nnode  = nnode*(nx**ldim) ! nnodes
      nppp   = nnode/np         ! nnodes/proc

      dofps  = nnode/telaps     ! DOF/sec - scalar form
      titers = telaps/maxits    ! time per iteration
      tppp_s = titers/nppp      ! time per iteraton per local point

      if (nid.eq.0) write(6,1) 'case scalar:'
     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s

C     Solve libCEED algebraic RHS
      if (nid.eq.0) write (6,*) "libCEED algebraic RHS"
      maxit = 100
      tstart = dnekclock()
      call cggos(u1,r2,h1,h2,vmult,binvm1,tol,ceed,op_mass,
     $  vec_p1,vec_ap1,maxit,'bp1')
      tstop  = dnekclock()

C     Output
      telaps = (tstop-tstart)
      maxits = maxit
      er1 = glrdif(u1,e1,n)
      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit

      if (test.eq.1.and.nid.eq.0) then
        if (maxit>=100) then
          write(6,*) "UNCONVERGED CG"
        endif
        if (dabs(er1)>1e-5) then
          write(6,*) "ERROR IS TOO LARGE"
        endif
      endif

      nx      = nx1-1
      nnode   = nelgt            ! nnodes
      nnode   = nnode*(nx**ldim) ! nnodes
      nppp    = nnode/np         ! nnodes/proc

      dofps   = nnode/telaps     ! DOF/sec - scalar form
      titers  = telaps/maxits    ! time per iteration
      tppp_s  = titers/nppp      ! time per iteraton per local point

      if (nid.eq.0) write(6,1) 'case scalar:'
     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s

C     Solve Nek5000 algebraic RHS
      if (nid.eq.0) write (6,*) "Nek5000 algebraic RHS"
      maxit = 100
      tstart = dnekclock()
      call cggos(u1,r3,h1,h2,vmult,binvm1,tol,ceed,op_mass,
     $  vec_p1,vec_ap1,maxit,'bp1')
      tstop  = dnekclock()

C     Output
      telaps = (tstop-tstart)
      maxits = maxit
      er1 = glrdif(u1,e1,n)
      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit

      if (test.eq.1.and.nid.eq.0) then
        if (maxit>=100) then
          write(6,*) "UNCONVERGED CG"
        endif
        if (dabs(er1)>1e-5) then
          write(6,*) "ERROR IS TOO LARGE"
        endif
      endif

      nx      = nx1-1
      nnode   = nelgt            ! nnodes
      nnode   = nnode*(nx**ldim) ! nnodes
      nppp    = nnode/np         ! nnodes/proc

      dofps   = nnode/telaps     ! DOF/sec - scalar form
      titers  = telaps/maxits    ! time per iteration
      tppp_s  = titers/nppp      ! time per iteraton per local point

      if (nid.eq.0) write(6,1) 'case scalar:'
     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s

    1 format(a12,i7,i3,i7,i10,i14,i10,i4,1p4e13.5)
    3 format(i3,i9,e12.4,1x,a8,i9)

C     Destroy ceed handles
      call ceedvectordestroy(vec_p1,err)
      call ceedvectordestroy(vec_ap1,err)
      call ceedvectordestroy(vec_rhs,err)
      call ceedvectordestroy(vec_qdata,err)
      call ceedvectordestroy(vec_coords,err)
      call ceedelemrestrictiondestroy(erstrctu,err)
      call ceedelemrestrictiondestroy(erstrctx,err)
      call ceedelemrestrictiondestroy(erstrctw,err)
      call ceedbasisdestroy(basisu,err)
      call ceedbasisdestroy(basisx,err)
      call ceedqfunctiondestroy(qf_setup,err)
      call ceedqfunctiondestroy(qf_mass,err)
      call ceedoperatordestroy(op_setup,err)
      call ceedoperatordestroy(op_mass,err)
      call ceeddestroy(ceed,err)

      return
      end
C-----------------------------------------------------------------------
      subroutine bp3
C     Solution to BP3 using libCEED
      include 'SIZE'
      include 'TOTAL'
      include 'CTIMER'  ! ifsync
      include 'FDMH1'
      include 'ceed/fortran.h'

      parameter (lzq=lx1+1)
      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2),lq=lzq**ldim)
      common /bpgfactors/ gf(lg*lq,lelt),bmq(lq,lelt),w3mq(lq)

      parameter (lt=lx1*ly1*lz1*lelt)
      parameter (ld=lxd*lyd*lzd*lelt)
      common /vcrns/ u1(lt),r1(lt),r2(lt),r3(lt)
      common /vcrny/ e1(lt)
      common /vcrvh/ h1(lt),h2(ld),pap(3)
      real*8 coords(ldim*lx*lelt)

      logical ifh3
      integer*8 nnode
      integer ceed,err,test
      character*64 spec

      integer p,q,ncompx,ncompu,enode,lnode
      integer vec_p1,vec_ap1,vec_qdata,vec_coords,vec_rhs
      integer stridesu(3),stridesx(3),stridesw(3)
      integer erstrctu,erstrctx,erstrctw
      integer basisu,basisx
      integer qf_diffusion,qf_setup
      integer op_diffusion,op_setup
      integer ii,i,ngeo
      real*8  x,y,z
      integer*8 offset

      external diffusionf,diffsetupf

      ifield = 1
      nxq    = nx1+1
      n = nx1*ny1*nz1*nelt

      ifsync = .false.

C     Set up coordinates and mask
      ii=0
      do j=0,nelt-1
      do i=1,lx
        ii=ii+1
        x = xm1(ii,1,1,1)
        y = ym1(ii,1,1,1)
        z = zm1(ii,1,1,1)
        coords(i+0*lx+3*j*lx)=x
        coords(i+1*lx+3*j*lx)=y
        coords(i+2*lx+3*j*lx)=z
        if ( x.eq.0.or.x.eq.1
     $   .or.y.eq.0.or.y.eq.1
     $   .or.z.eq.0.or.z.eq.1 ) then
          h2(ii)=0.
        else
          h2(ii)=1.
        endif
      enddo
      enddo

C     Init ceed library
      call get_spec(spec)
      call ceedinit(trim(spec)//char(0),ceed,err)

      call get_test(test)

C     Set up Nek geometry data
      call geodatq       (gf,bmq,w3mq,nxq)         ! Set up gf() arrays

C     Set up true soln
      call sin_fld_h1    (e1)
      call xmask1        (e1,h2,nelt)
      call copy          (h1,e1,n)                 ! Save exact soln in h1

C     Set up solver parameters
      tol       = 1e-10
      param(22) = tol
      maxit     = 100

      call nekgsync()

C     Create ceed basis for mesh and computation
      p=nx1
      q=p+1
      ncompu=1
      ncompx=ldim
      call ceedbasiscreatetensorh1lagrange(ceed,ldim,ncompx,p,q,
     $  ceed_gauss,basisx,err)
      call ceedbasiscreatetensorh1lagrange(ceed,ldim,ncompu,p,q,
     $  ceed_gauss,basisu,err)

C     Create ceed element restrictions for mesh and computation
      enode=p**ldim
      lnode=enode*nelt*ncompu
      ngeo=(ldim*(ldim+1))/2
      stridesx=[1,enode,enode*ldim]
      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,ldim,
     $  ldim*lnode,stridesx,erstrctx,err)
      stridesu=[1,enode,enode*ncompu]
      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,ncompu,
     $  ncompu*lnode,stridesu,erstrctu,err)
      stridesw=[1,q**ldim,ngeo*q**ldim]
      call ceedelemrestrictioncreatestrided(ceed,nelt,q**ldim,
     $  ngeo,nelt*q**ldim*ngeo,stridesw,erstrctw,err)

C     Create ceed vectors
      call ceedvectorcreate(ceed,lnode,vec_p1,err)
      call ceedvectorcreate(ceed,lnode,vec_ap1,err)
      call ceedvectorcreate(ceed,lnode,vec_rhs,err)
      call ceedvectorcreate(ceed,ldim*lx*nelt,vec_coords,err)
      call ceedvectorcreate(ceed,nelt*ngeo*q**ldim,vec_qdata,err)

      offset=0
      call ceedvectorsetarray(vec_coords,ceed_mem_host,
     $  ceed_use_pointer,coords,offset,err)

C     Create ceed qfunctions for diffsetupf and diffusionf
      call ceedqfunctioncreateinterior(ceed,1,diffsetupf,
     $  EXAMPLE_DIR
     $  //'bps/bps.h:diffsetupf'//char(0),qf_setup,err)
      call ceedqfunctionaddinput(qf_setup,'x',ncompx,
     $  ceed_eval_interp,err)
      call ceedqfunctionaddinput(qf_setup,'dx',ncompx*ldim,
     $  ceed_eval_grad,err)
      call ceedqfunctionaddinput(qf_setup,'weight',ncompu,
     $  ceed_eval_weight,err)
      call ceedqfunctionaddoutput(qf_setup,'qdata',ngeo,
     $  ceed_eval_none,err)
      call ceedqfunctionaddoutput(qf_setup,'rhs',ncompu,
     $  ceed_eval_interp,err)

      call ceedqfunctioncreateinterior(ceed,1,diffusionf,
     $  EXAMPLE_DIR
     $  //'bps/bps.h:diffusionf'//char(0),qf_diffusion,err)
      call ceedqfunctionaddinput(qf_diffusion,'u',ncompu*ldim,
     $  ceed_eval_grad,err)
      call ceedqfunctionaddinput(qf_diffusion,'qdata',ngeo,
     $  ceed_eval_none,err)
      call ceedqfunctionaddoutput(qf_diffusion,'v',ncompu*ldim,
     $  ceed_eval_grad,err)

C     Create ceed operators
      call ceedoperatorcreate(ceed,qf_setup,
     $  ceed_qfunction_none,ceed_qfunction_none,op_setup,err)
      call ceedoperatorsetfield(op_setup,'x',erstrctx,
     $  basisx,ceed_vector_active,err)
      call ceedoperatorsetfield(op_setup,'dx',erstrctx,
     $  basisx,ceed_vector_active,err)
      call ceedoperatorsetfield(op_setup,'weight',
     $  ceed_elemrestriction_none,basisx,ceed_vector_none,err)
      call ceedoperatorsetfield(op_setup,'qdata',erstrctw,
     $  ceed_basis_collocated,ceed_vector_active,err)
      call ceedoperatorsetfield(op_setup,'rhs',erstrctu,
     $  basisu,vec_rhs,err)

      call ceedoperatorcreate(ceed,qf_diffusion,
     $  ceed_qfunction_none,ceed_qfunction_none,op_diffusion,err)
      call ceedoperatorsetfield(op_diffusion,'u',erstrctu,
     $  basisu,ceed_vector_active,err)
      call ceedoperatorsetfield(op_diffusion,'qdata',erstrctw,
     $  ceed_basis_collocated,vec_qdata,err)
      call ceedoperatorsetfield(op_diffusion,'v',erstrctu,
     $  basisu,ceed_vector_active,err)

C     Compute setup data
      call ceedvectorsetarray(vec_rhs,ceed_mem_host,
     $  ceed_use_pointer,r1,offset,err)
      call ceedoperatorapply(op_setup,vec_coords,vec_qdata,
     $  ceed_request_immediate,err)

C     Set up true RHS
      call dssum         (r1,nx1,ny1,nz1)          ! r1
      call xmask1        (r1,h2,nelt)

C     Set up algebraic RHS with libCEED
      call ceedvectorsetarray(vec_p1,ceed_mem_host,
     $  ceed_use_pointer,h1,offset,err)
      call ceedvectorsetarray(vec_ap1,ceed_mem_host,
     $  ceed_use_pointer,r2,offset,err)
      call ceedoperatorapply(op_diffusion,vec_p1,vec_ap1,
     $  ceed_request_immediate,err)                ! r2 = A_ceed*h1
      call dssum         (r2,nx1,ny1,nz1)
      call xmask1        (r2,h2,nelt)

C     Set up algebraic RHS with Nek5000
      call axhm1         (pap,r3,h1,h1,h2,'bp3')   ! r3 = A_nek5k*h1
      call dssum         (r3,nx1,ny1,nz1)
      call xmask1        (r3,h2,nelt)

      call nekgsync()

C     Solve true RHS
      if (nid.eq.0) write (6,*) "libCEED true RHS"
      tstart = dnekclock()
      call cggos(u1,r1,h1,h2,vmult,binvm1,tol,ceed,op_diffusion,
     $  vec_p1,vec_ap1,maxit,'bp3')
      tstop  = dnekclock()

C     Output
      telaps = (tstop-tstart)
      maxits = maxit
      er1 = glrdif(u1,e1,n)
      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit

      if (test.eq.1.and.nid.eq.0) then
        if (maxit>=100) then
          write(6,*) "UNCONVERGED CG"
        endif
        if (dabs(er1)>1e-3) then
          write(6,*) "ERROR IS TOO LARGE"
        endif
      endif

      nx      = nx1-1
      nnode   = nelgt            ! nnodes
      nnode   = nnode*(nx**ldim) ! nnodes
      nppp    = nnode/np         ! nnodes/proc

      dofps   = nnode/telaps     ! DOF/sec - scalar form
      titers  = telaps/maxits    ! time per iteration
      tppp_s  = titers/nppp      ! time per iteraton per local point

      if (nid.eq.0) write(6,1) 'case scalar:'
     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s

C     Solve libCEED algebraic RHS
      if (nid.eq.0) write (6,*) "libCEED algebraic RHS"
      maxit = 100
      tstart = dnekclock()
      call cggos(u1,r2,h1,h2,vmult,binvm1,tol,ceed,op_diffusion,
     $  vec_p1,vec_ap1,maxit,'bp3')
      tstop  = dnekclock()

C     Output
      telaps = (tstop-tstart)
      maxits = maxit
      er1 = glrdif(u1,e1,n)
      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit

      if (test.eq.1.and.nid.eq.0) then
        if (maxit>=100) then
          write(6,*) "UNCONVERGED CG"
        endif
        if (dabs(er1)>1e-9) then
          write(6,*) "ERROR IS TOO LARGE"
        endif
      endif

      nx      = nx1-1
      nnode   = nelgt            ! nnodes
      nnode   = nnode*(nx**ldim) ! nnodes
      nppp    = nnode/np         ! nnodes/proc

      dofps   = nnode/telaps     ! DOF/sec - scalar form
      titers  = telaps/maxits    ! time per iteration
      tppp_s  = titers/nppp      ! time per iteraton per local point

      if (nid.eq.0) write(6,1) 'case scalar:'
     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s

C     Solve Nek5000 algebraic RHS
      if (nid.eq.0) write (6,*) "Nek5000 algebraic RHS"
      maxit = 100
      tstart = dnekclock()
      call cggos(u1,r3,h1,h2,vmult,binvm1,tol,ceed,op_diffusion,
     $  vec_p1,vec_ap1,maxit,'bp3')
      tstop  = dnekclock()

C     Output
      telaps = (tstop-tstart)
      maxits = maxit
      er1 = glrdif(u1,e1,n)
      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit

      if (test.eq.1.and.nid.eq.0) then
        if (maxit>=100) then
          write(6,*) "UNCONVERGED CG"
        endif
        if (dabs(er1)>1e-9) then
          write(6,*) "ERROR IS TOO LARGE"
        endif
      endif

      nx      = nx1-1
      nnode   = nelgt            ! nnodes
      nnode   = nnode*(nx**ldim) ! nnodes
      nppp    = nnode/np         ! nnodes/proc

      dofps   = nnode/telaps     ! DOF/sec - scalar form
      titers  = telaps/maxits    ! time per iteration
      tppp_s  = titers/nppp      ! time per iteraton per local point

      if (nid.eq.0) write(6,1) 'case scalar:'
     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s

    1 format(a12,i7,i3,i7,i10,i14,i10,i4,1p4e13.5)
    3 format(i3,i9,e12.4,1x,a8,i9)

C     Destroy ceed handles
      call ceedvectordestroy(vec_p1,err)
      call ceedvectordestroy(vec_ap1,err)
      call ceedvectordestroy(vec_rhs,err)
      call ceedvectordestroy(vec_qdata,err)
      call ceedvectordestroy(vec_coords,err)
      call ceedelemrestrictiondestroy(erstrctu,err)
      call ceedelemrestrictiondestroy(erstrctx,err)
      call ceedelemrestrictiondestroy(erstrctw,err)
      call ceedbasisdestroy(basisu,err)
      call ceedbasisdestroy(basisx,err)
      call ceedqfunctiondestroy(qf_setup,err)
      call ceedqfunctiondestroy(qf_diffusion,err)
      call ceedoperatordestroy(op_setup,err)
      call ceedoperatordestroy(op_diffusion,err)
      call ceeddestroy(ceed,err)

      return
      end
C-----------------------------------------------------------------------
      subroutine cggos(u1,r1,h1,h2,rmult,binv,tin,ceed,ceed_op,vec_p1,
     $  vec_ap1,maxit,bpname)
C     Scalar conjugate gradient iteration for solution of uncoupled
C     Helmholtz equations
C     Input: r1,h1,h2,rmult,binv,tin,ceed,ceed_op,vec_p1,vec_ap1,bpname
C     Output: u1,maxit
      include 'SIZE'
      include 'TOTAL'
      include 'DOMAIN'
      include 'FDMH1'
      character*3 bpname

C     INPUT:  rhs1 - rhs
C             h1   - exact solution

      parameter (lt=lx1*ly1*lz1*lelt)
      parameter (ld=lxd*lyd*lzd*lelt)
      real*8 u1(lt),r1(lt),h1(lt),h2(lt)
      real*8 rmult(1),binv(1)
      integer ceed,ceed_op,vec_ap1,vec_p1
      common /scrcg/ dpc(lt),p1(lt),z1(lt)
      common /scrca/ wv(4),wk(4),rpp1(4),rpp2(4),alph(4),beta(4),pap(4)

      real*8 ap1(lt)
      equivalence (ap1,z1)

      vol   = volfld(ifield)
      nel   = nelfld(ifield)
      nxyz  = lx1*ly1*lz1
      n     = nxyz*nel
      nx    = nx1-1                  ! Polynomial order (just for i/o)

      tol=tin

      if(bpname.ne.'bp1') then
        call setprecn_bp3(dpc,h1,h2) ! Set up diagional pre-conidtioner
      else
        call setprecn_bp1(dpc,h1,h2) ! Set up diagional pre-conidtioner
      endif

      call rzero         (u1,n)      ! Initialize solution

      wv(1)=0
      do i=1,n
         s=rmult(i)                  !      -1
         p1(i)=dpc(i)*r1(i)          ! p = M  r      T
         wv(1)=wv(1)+s*p1(i)*r1(i)   !              r p
      enddo
      call gop(wv(1),wk,'+  ',1)
      rpp1(1) = wv  (1)

      do 1000 iter=1,maxit
         call axhm1_ceed (pap,ap1,p1,h1,h2,ceed,ceed_op,
     $     vec_ap1,vec_p1)
         call dssum    (ap1,nx1,ny1,nz1)
         if (bpname.ne.'bp1') call xmask1(ap1,h2,nel)

         call gop      (pap,wk,'+  ',1)
         alph(1) = rpp1(1)/pap(1)

         do i=1,n
            u1(i)=u1(i)+alph(1)* p1(i)
            r1(i)=r1(i)-alph(1)*ap1(i)
         enddo

C        tolerance check here
         call rzero(wv,2)
         do i=1,n
            wv(1)=wv(1)+r1(i)*r1(i)            ! L2 error estimate
            z1(i)=dpc(i)*r1(i)                 ! z = M  r
            wv(2)=wv(2)+rmult(i)*z1(i)*r1(i)   ! r z
         enddo
         call gop(wv,wk,'+  ',2)

C        if (nio.eq.0) write(6,1) ifield,istep,iter,nx,(wv(k),k=1,1)
  1     format(i2,i9,i5,i4,1p1e12.4,' cggos')

         enorm=sqrt(wv(1))
         if (enorm.lt.tol) then
            ifin = iter
            if (nio.eq.0) write(6,3000) istep,ifin,enorm,tol
            goto 9999
         endif
C        if (nio.eq.0) write(6,2) iter,enorm,alph(1),pap(1),'alpha'
 2      format(i5,1p3e12.4,2x,a5)

         rpp2(1)=rpp1(1)
         rpp1(1)=wv  (2)
         beta1  =rpp1(1)/rpp2(1)
         do i=1,n
            p1(i)=z1(i) + beta1*p1(i)
         enddo

 1000 continue

      rbnorm=sqrt(wv(1))
      if (nio.eq.0) write (6,3001) istep,iter,rbnorm,tol
      iter = iter-1

 9999 continue

      maxit=iter

 3000 format(i12,1x,'cggo scalar:',i6,1p5e13.4)
 3001 format(2i6,' Unconverged cggo scalar: rbnorm =',1p2e13.6)

      return
      end
C-----------------------------------------------------------------------
      subroutine axhm1_ceed(pap,ap1,p1,h1,h2,ceed,ceed_op,
     $  vec_ap1,vec_p1)
C     Vector conjugate gradient matvec for solution of uncoupled
C     Helmholtz equations
C     Input: pap,p1,h1,h2,bpname,ceed,ceed_op,vec_ap1,vec_p1
C     Output: ap1
      include 'SIZE'
      include 'TOTAL'
      include 'ceed/fortran.h'

      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2))
      real*8       gf(lg,lx,lelt)             ! Equivalence new gf() data
      equivalence (gf,g1m1)                   ! layout to g1m1...g6m1

      real*8   pap(3)
      real*8   ap1(lx,lelt)
      real*8    p1(lx,lelt)
      real*8    h1(lx,lelt),h2(lx,lelt)
      integer ceed,ceed_op,vec_ap1,vec_p1,err
      integer i,e
      integer*8 offset

      offset=0
      call ceedvectorsetarray(vec_p1,ceed_mem_host,ceed_use_pointer,
     $  p1,offset,err)
      call ceedvectorsetarray(vec_ap1,ceed_mem_host,ceed_use_pointer,
     $  ap1,offset,err)

      call ceedoperatorapply(ceed_op,vec_p1,vec_ap1,
     $  ceed_request_immediate,err)

      call ceedvectortakearray(vec_p1,ceed_mem_host,0,offset,err)
      call ceedvectortakearray(vec_ap1,ceed_mem_host,0,offset,err)

      pap(1)=0.

      do e=1,nelt
         do i=1,lx
           pap(1)=pap(1)+ap1(i,e)*p1(i,e)
         enddo
      enddo

      return
      end
C-----------------------------------------------------------------------
      subroutine ax_e_bp1(w,u,g,h1,h2,b,ju,us,ut)
C     Local matrix-vector for solution of BP3 (stiffness matrix)
C     Input: u,g,h1,h2,b,ju,us,ut   Output: w
      include 'SIZE'
      include 'TOTAL'

      parameter (lxyz=lx1*ly1*lz1,lg=3+3*(ldim-2))
      real*8 w(lxyz),u(lxyz),g(lg,lxyz),h1(lxyz),h2(lxyz),b(lxyz)
      real*8 ju(lxyz),us(lxyz),ut(lxyz)

      nxq = nx1+1 ! Number of quadrature points

      lxyzq = nxq**ldim

      call intp_rstd (ju,u,lx1,nxq,if3d,0) ! 0 --> Fwd interpolation
      do i=1,lxyzq
         ju(i)=ju(i)*h2(i) !! h2 must be on the fine grid, w/ quad wts
      enddo
      call intp_rstd (w,ju,lx1,nxq,if3d,1) ! 1 --> ju-->u

      return
      end
C-----------------------------------------------------------------------
      subroutine axhm1_bp1(pap,ap1,p1,h1,h2)
C     Vector conjugate gradient matvec for solution of BP1 (mass matrix)
C     Input: pap,p1,h1,h2           Output: ap1
      include 'SIZE'
      include 'TOTAL'

      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2))
      real*8         gf(lg,lx,lelt)             ! Equivalence new gf() data
      equivalence (gf,g1m1)                     ! layout to g1m1...g6m1

      real*8 pap(3)
      real*8 ap1(lx,lelt)
      real*8  p1(lx,lelt)
      real*8  h1(lx,lelt),h2(lx,lelt)

      real*8 ur(lx),us(lx),ut(lx)
      common /ctmp1/ ur,us,ut

      integer e

      pap(1)=0.

      k=1
      nxq = nx1+1

      do e=1,nelt

         call ax_e_bp1(ap1(1,e),p1(1,e),gf(1,1,e),h1(1,e),h2(k,1)
     $                                          ,bm1(1,1,1,e),ur,us,ut)
         do i=1,lx
           pap(1)=pap(1)+ap1(i,e)*p1(i,e)
         enddo
         k=k+nxq*nxq*nxq

      enddo

      return
      end
C-----------------------------------------------------------------------
      subroutine ax_e_bp3(w,u,g,ur,us,ut,wk)
C     Local matrix-vector for solution of BP3 (stiffness matrix)
C     Input: u,g,ur,us,ut,wk        Output: w
      include 'SIZE'
      include 'TOTAL'

      parameter (lzq=lx1+1,lxyz=lx1*lx1*lx1,lxyzq=lzq*lzq*lzq)

      common /ctmp0/ tmp(lxyzq)
      common /dxmfine/ dxmq(lzq,lzq),dxtmq(lzq,lzq)

      real*8 ur(lxyzq),us(lxyzq),ut(lxyzq),wk(lxyzq)
      real*8 w(lxyz),u(lxyz),g(2*ldim,lxyzq)

      n = lzq-1

      call intp_rstd  (wk,u,lx1,lzq,if3d,0) ! 0 --> Fwd interpolation
      call loc_grad3  (ur,us,ut,wk,n,dxmq,dxtmq)

      do i=1,lxyzq
         wr = g(1,i)*ur(i) + g(2,i)*us(i) + g(3,i)*ut(i)
         ws = g(2,i)*ur(i) + g(4,i)*us(i) + g(5,i)*ut(i)
         wt = g(3,i)*ur(i) + g(5,i)*us(i) + g(6,i)*ut(i)
         ur(i) = wr
         us(i) = ws
         ut(i) = wt
      enddo

      call loc_grad3t (wk,ur,us,ut,n,dxmq,dxtmq,tmp)
      call intp_rstd  (w,wk,lx1,lzq,if3d,1) ! 1 --> ju-->u

      return
      end
C-----------------------------------------------------------------------
      subroutine axhm1_bp3(pap,ap1,p1,h1,h2)
C     Vector conjugate gradient matvec for solution of BP3 (stiffness matrix)
C     Input: pap,p1,h1,h2           Output: ap1
      include 'SIZE'
      include 'TOTAL'

      parameter (lzq=lx1+1)
      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2),lq=lzq**ldim)
      common /bpgfactors/ gf(lg,lq,lelt),bmq(lq,lelt),w3mq(lq)

      real*8 pap(3)
      real*8 ap1(lx,lelt)
      real*8  p1(lx,lelt)
      real*8  h1(lx,lelt),h2(lx,lelt)

      common /ctmp1/ ur,us,ut,wk
      real*8 ur(lq),us(lq),ut(lq),wk(lq)

      integer e

      pap(1)=0.

      do e=1,nelt

         call ax_e_bp3(ap1(1,e),p1(1,e),gf(1,1,e),ur,us,ut,wk)
         do i=1,lx
           pap(1)=pap(1)+p1(i,e)*ap1(i,e)
         enddo

      enddo

      return
      end
C-----------------------------------------------------------------------
      subroutine axhm1(pap,ap1,p1,h1,h2,bpname)
C     Vector conjugate gradient matvec for solution of uncoupled
C     Helmholtz equations
C     Input: pap,p1,h1,h2,bpname    Output: ap1
      include 'SIZE'
      include 'TOTAL'

      parameter (lx=lx1*ly1*lz1)

      real*8 pap(3),ap1(lx,lelt),p1(lx,lelt)
      real*8 h1(lx,lelt),h2(lx,lelt)

      character*3 bpname

      if (bpname.eq.'bp1') then
         call axhm1_bp1(pap,ap1,p1,h1,h2)

      elseif (bpname.eq.'bp3') then
         call axhm1_bp3(pap,ap1,p1,h1,h2)

      else
         write(6,*) bpname,' axhm1 bpname error'
         stop

      endif

      return
      end
C-----------------------------------------------------------------------
      subroutine get_bp(bp)
C     Get BP to run
C     Input:                        Output: bp
      integer i,bp
      character*64 bpval

      bp=0
      if(iargc().ge.1) then
        call getarg(1,bpval)
      endif
      if(bpval.eq."bp1") then
        bp=1
      elseif(bpval.eq."bp3") then
        bp=3
      endif

      return
      end
C-----------------------------------------------------------------------
      subroutine get_spec(spec)
C     Get CEED backend specification
C     Input:                        Output: spec
      integer i
      character*64 spec

      spec = '/cpu/self'
      if(iargc().ge.2) then
        call getarg(2,spec)
      endif

      return
      end
C-----------------------------------------------------------------------
      subroutine get_test(test)
C     Get test mode flag
C     Input:                        Output: test
      integer i,test
      character*64 testval

      test=0
      if(iargc().ge.3) then
        call getarg(3,testval)
      endif
      if(testval.eq."test") then
        test=1
      endif

      return
      end
C-----------------------------------------------------------------------

