// d7/d1d/a01475_source.html

 //////////////////////////////////////////////////////////////////////////////////////
 // This file is distributed under the University of Illinois/NCSA Open Source License.
 // See LICENSE file in top directory for details.
 //
 // Copyright (c) 2020 QMCPACK developers.
 //
 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////


 #include "SplineC2COMPTarget.h"
 #include "spline2/MultiBsplineEval.hpp"
 #include "spline2/MultiBsplineEval_OMPoffload.hpp"
 #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
 #include "Platforms/OMPTarget/ompReductionComplex.hpp"
 #include "ApplyPhaseC2C.hpp"
 #include "Concurrency/OpenMP.h"

 namespace qmcplusplus
 {
 /// Copy constructor, explicitly defaulted: each member is copied using its own copy semantics.
 template<typename ST>
 SplineC2COMPTarget<ST>::SplineC2COMPTarget(const SplineC2COMPTarget& in) = default;

 /** Copy one complex orbital, given as two single splines, into the multi-spline storage.
  *
  * The real part goes to slot 2 * ispline and the imaginary part to slot 2 * ispline + 1.
  *
  * @param spline_r single spline holding the real part
  * @param spline_i single spline holding the imaginary part
  * @param twist    not used by this implementation
  * @param ispline  index of the complex orbital
  * @param level    not used by this implementation
  */
 template<typename ST>
 inline void SplineC2COMPTarget<ST>::set_spline(SingleSplineType* spline_r,
                                                SingleSplineType* spline_i,
                                                int twist,
                                                int ispline,
                                                int level)
 {
   const int real_slot = 2 * ispline;
   const int imag_slot = 2 * ispline + 1;

   SplineInst->copy_spline(spline_r, real_slot);
   SplineInst->copy_spline(spline_i, imag_slot);
 }

 /** Read the spline table from an HDF5 archive.
  *
  * The entry name is "spline_" followed by MyIndex.
  *
  * @param h5f archive to read from
  * @return the value returned by h5f.readEntry
  */
 template<typename ST>
 bool SplineC2COMPTarget<ST>::read_splines(hdf_archive& h5f)
 {
   std::ostringstream name_builder;
   name_builder << "spline_" << MyIndex;
   const std::string entry_name = name_builder.str();

   // wrap the raw spline pointer so the archive can (de)serialize it
   einspline_engine<SplineType> spline_table(SplineInst->getSplinePtr());
   return h5f.readEntry(spline_table, entry_name.c_str());
 }

 /** Write the spline table to an HDF5 archive.
  *
  * The entry name is "spline_" followed by MyIndex.
  *
  * @param h5f archive to write to
  * @return the value returned by h5f.writeEntry
  */
 template<typename ST>
 bool SplineC2COMPTarget<ST>::write_splines(hdf_archive& h5f)
 {
   std::ostringstream name_builder;
   name_builder << "spline_" << MyIndex;
   const std::string entry_name = name_builder.str();

   // wrap the raw spline pointer so the archive can (de)serialize it
   einspline_engine<SplineType> spline_table(SplineInst->getSplinePtr());
   return h5f.writeEntry(spline_table, entry_name.c_str());
 }

 /** Apply the per-orbital phase to spline values and store the complex result in psi.
  *
  * For every complex orbital j in [first, last) the pair (myV[2j], myV[2j+1]) is treated as
  * (real, imaginary) and rotated by the sine/cosine that omptarget::sincos returns for the
  * argument -(k_j . r). The result is written to psi[j + first_spo].
  *
  * @param r     cartesian position
  * @param myV   spline values, interleaved real/imaginary
  * @param psi   output orbital values
  * @param first first complex orbital index to process
  * @param last  one past the last complex orbital index; clamped to min(kPoints.size(), psi.size())
  */
 template<typename ST>
 inline void SplineC2COMPTarget<ST>::assign_v(const PointType& r,
                                              const vContainer_type& myV,
                                              ValueVector& psi,
                                              int first,
                                              int last) const
 {
   // never run past the number of k-points or the size of the output vector
   const size_t upper_limit = std::min(kPoints.size(), psi.size());
   if (last > upper_limit)
     last = upper_limit;

   const ST pos_x = r[0];
   const ST pos_y = r[1];
   const ST pos_z = r[2];

   const ST* restrict kcart_x = myKcart->data(0);
   const ST* restrict kcart_y = myKcart->data(1);
   const ST* restrict kcart_z = myKcart->data(2);
 #pragma omp simd
   for (size_t iorb = first; iorb < last; ++iorb)
   {
     const ST re = myV[2 * iorb];
     const ST im = myV[2 * iorb + 1];

     ST sin_ph, cos_ph;
     omptarget::sincos(-(pos_x * kcart_x[iorb] + pos_y * kcart_y[iorb] + pos_z * kcart_z[iorb]), &sin_ph, &cos_ph);

     psi[iorb + first_spo] = ComplexT(re * cos_ph - im * sin_ph, im * cos_ph + re * sin_ph);
   }
 }

 /** Evaluate the orbital values for particle iat of P on the host.
  *
  * Inside an OpenMP parallel region each thread takes its share of the spline entries
  * (as assigned by FairDivideAligned), evaluates the spline there and converts the
  * result into complex orbital values with assign_v.
  *
  * @param P   particle set supplying the active position
  * @param iat particle index
  * @param psi output orbital values
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluateValue(const ParticleSet& P, const int iat, ValueVector& psi)
 {
   const PointType& pos = P.activeR(iat);
   PointType pos_unit(PrimLattice.toUnit_floor(pos));

 #pragma omp parallel
   {
     const int team_size = omp_get_num_threads();
     const int my_rank   = omp_get_thread_num();

     // psi is complex while the spline storage and evaluation use a real type,
     // so there are two spline entries per orbital
     int spline_begin, spline_end;
     FairDivideAligned(2 * psi.size(), getAlignment<ST>(), team_size, my_rank, spline_begin, spline_end);

     spline2::evaluate3d(SplineInst->getSplinePtr(), pos_unit, myV, spline_begin, spline_end);
     assign_v(pos, myV, psi, spline_begin / 2, spline_end / 2);
   }
 }

 /** Compute one ratio per virtual particle of VP.
  *
  * For every virtual particle the spline is evaluated, the phase is applied through C2C::assign_v
  * and the resulting orbital values are contracted with psiinv. The work is issued through
  * PRAGMA_OFFLOAD("omp target teams distribute ...") in chunks of ChunkSizePerTeam spline entries;
  * each (particle, chunk) pair writes a partial sum to ratios_private, and the partial sums are
  * added up on the host afterwards.
  *
  * @param VP     virtual particle set supplying the positions through VP.activeR
  * @param psi    not referenced in this function body
  * @param psiinv vector the orbital values are contracted with
  * @param ratios output; ratios[iat] is overwritten for iat in [0, VP.getTotalNum())
  *
  * NOTE(review): when VP.getTotalNum() is 0 the num_teams expression below evaluates to 0 —
  * confirm that callers never pass an empty virtual particle set.
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
                                                ValueVector& psi,
                                                const ValueVector& psiinv,
                                                std::vector<ValueType>& ratios)
 {
   const int nVP = VP.getTotalNum();
   // psiinv followed by nVP * 3 extra elements; the packing loop below stores 6 RealType values
   // per particle in that tail (presumably one element holds two RealType values — TODO confirm)
   psiinv_pos_copy.resize(psiinv.size() + nVP * 3);

   // stage psiinv to psiinv_pos_copy
   std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());

   // pack particle positions
   // per particle: [0..2] = r as returned by VP.activeR, [3..5] = PrimLattice.toUnit_floor(r)
   auto* restrict pos_scratch = reinterpret_cast<RealType*>(psiinv_pos_copy.data() + psiinv.size());
   for (int iat = 0; iat < nVP; ++iat)
   {
     const PointType& r = VP.activeR(iat);
     PointType ru(PrimLattice.toUnit_floor(r));
     pos_scratch[iat * 6]     = r[0];
     pos_scratch[iat * 6 + 1] = r[1];
     pos_scratch[iat * 6 + 2] = r[2];
     pos_scratch[iat * 6 + 3] = ru[0];
     pos_scratch[iat * 6 + 4] = ru[1];
     pos_scratch[iat * 6 + 5] = ru[2];
   }

   // the myV.size() spline entries are split into chunks of ChunkSizePerTeam (rounded up)
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
   // one partial sum per (virtual particle, chunk)
   ratios_private.resize(nVP, NumTeams);
   const auto spline_padded_size = myV.size();
   const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
   offload_scratch.resize(spline_padded_size * nVP);
   results_scratch.resize(sposet_padded_size * nVP);

   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
   auto* offload_scratch_ptr      = offload_scratch.data();
   auto* results_scratch_ptr      = results_scratch.data();
   const auto myKcart_padded_size = myKcart->capacity();
   auto* myKcart_ptr              = myKcart->data();
   auto* psiinv_ptr               = psiinv_pos_copy.data();
   auto* ratios_private_ptr       = ratios_private.data();
   const size_t first_spo_local   = first_spo;
   const auto orb_size            = psiinv.size();

   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \
                 map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \
                 map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
     for (int iat = 0; iat < nVP; iat++)
       for (int team_id = 0; team_id < NumTeams; team_id++)
       {
         // [first, last) is this chunk's range of spline entries
         const size_t first = ChunkSizePerTeam * team_id;
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);

         // per-particle slices of the scratch buffers; positions live right after psiinv
         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
         auto* restrict psi_iat_ptr             = results_scratch_ptr + sposet_padded_size * iat;
         auto* restrict pos_scratch             = reinterpret_cast<RealType*>(psiinv_ptr + orb_size);

         int ix, iy, iz;
         ST a[4], b[4], c[4];
         spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
                                               ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);

         // step 1: spline values for this chunk go to offload_scratch
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = 0; index < last - first; index++)
           spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
                                              offload_scratch_iat_ptr + first + index);
         // spline indices are halved to get the orbital index range, capped at orb_size
         const size_t first_cplx = first / 2;
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         // step 2: C2C::assign_v is called once per orbital index with the scratch and output pointers
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
                         psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);

         // step 3: partial dot product of this chunk's orbital values with psiinv
         ComplexT sum(0);
         PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
         for (int i = first_cplx; i < last_cplx; i++)
           sum += psi_iat_ptr[i] * psiinv_ptr[i];
         ratios_private_ptr[iat * NumTeams + team_id] = sum;
       }
   }

   // do the reduction manually
   // (adds the NumTeams partial sums of each virtual particle)
   for (int iat = 0; iat < nVP; ++iat)
   {
     ratios[iat] = ComplexT(0);
     for (int tid = 0; tid < NumTeams; tid++)
       ratios[iat] += ratios_private[iat][tid];
   }
 }

 /** Multi-walker variant of evaluateDetRatios: one ratio per virtual particle of every walker.
  *
  * All inputs needed by the kernel are packed into a single byte buffer (det_ratios_buffer_H2D) laid out as
  *   [ nw pointers (one invRow pointer per walker) ]
  *   [ mw_nVP * 6 values of type ST (positions)     ]
  *   [ mw_nVP ints (walker index of each particle)  ]
  * The kernel then follows the same three steps as the single-walker version: spline evaluation,
  * C2C::assign_v, and a partial dot product with the walker's invRow. Partial sums are stored in
  * mw_ratios_private and added up on the host.
  *
  * Must be called on the leader of spo_list (checked by assert). Scratch buffers are taken from the
  * leader's mw_mem_handle_ resource.
  *
  * @param spo_list        walker SPO sets; only the leader and the list size are used here
  * @param vp_list         one virtual particle set per walker
  * @param psi_list        not referenced in this function body
  * @param invRow_ptr_list one pointer per walker; dereferenced inside the offload region, so it
  *                        presumably has to be valid there — TODO confirm with callers
  * @param ratios_list     output; ratios_list[iw][iat] is overwritten. ratios_list[iw].size() is
  *                        asserted to equal vp_list[iw].getTotalNum()
  *
  * NOTE(review): when the total number of virtual particles is 0 the num_teams expression below
  * evaluates to 0 — confirm that callers never pass only empty virtual particle sets.
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet>& spo_list,
                                                   const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
                                                   const RefVector<ValueVector>& psi_list,
                                                   const std::vector<const ValueType*>& invRow_ptr_list,
                                                   std::vector<std::vector<ValueType>>& ratios_list) const
 {
   assert(this == &spo_list.getLeader());
   auto& phi_leader            = spo_list.getCastedLeader<SplineC2COMPTarget<ST>>();
   auto& mw_mem                = phi_leader.mw_mem_handle_.getResource();
   auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
   auto& mw_ratios_private     = mw_mem.mw_ratios_private;
   auto& mw_offload_scratch    = mw_mem.mw_offload_scratch;
   auto& mw_results_scratch    = mw_mem.mw_results_scratch;
   const size_t nw             = spo_list.size();
   const size_t orb_size       = phi_leader.size();

   // total number of virtual particles over all walkers
   size_t mw_nVP = 0;
   for (const VirtualParticleSet& VP : vp_list)
     mw_nVP += VP.getTotalNum();

   // bytes: nw pointers + per particle (6 position values + 1 walker index)
   const size_t packed_size = nw * sizeof(ValueType*) + mw_nVP * (6 * sizeof(ST) + sizeof(int));
   det_ratios_buffer_H2D.resize(packed_size);

   // pack invRow_ptr_list to det_ratios_buffer_H2D
   Vector<const ValueType*> ptr_buffer(reinterpret_cast<const ValueType**>(det_ratios_buffer_H2D.data()), nw);
   for (size_t iw = 0; iw < nw; iw++)
     ptr_buffer[iw] = invRow_ptr_list[iw];

   // pack particle positions
   // per particle: [0..2] = r as returned by VP.activeR, [3..5] = PrimLattice.toUnit_floor(r);
   // ref_id_ptr records which walker each particle belongs to
   auto* pos_ptr = reinterpret_cast<ST*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*));
   auto* ref_id_ptr =
       reinterpret_cast<int*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
   size_t iVP = 0;
   for (size_t iw = 0; iw < nw; iw++)
   {
     const VirtualParticleSet& VP = vp_list[iw];
     assert(ratios_list[iw].size() == VP.getTotalNum());
     for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP)
     {
       ref_id_ptr[iVP]    = iw;
       const PointType& r = VP.activeR(iat);
       PointType ru(PrimLattice.toUnit_floor(r));
       pos_ptr[0] = r[0];
       pos_ptr[1] = r[1];
       pos_ptr[2] = r[2];
       pos_ptr[3] = ru[0];
       pos_ptr[4] = ru[1];
       pos_ptr[5] = ru[2];
       pos_ptr += 6;
     }
   }

   // the myV.size() spline entries are split into chunks of ChunkSizePerTeam (rounded up)
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
   // one partial sum per (virtual particle, chunk)
   mw_ratios_private.resize(mw_nVP, NumTeams);
   const auto spline_padded_size = myV.size();
   const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
   mw_offload_scratch.resize(spline_padded_size * mw_nVP);
   mw_results_scratch.resize(sposet_padded_size * mw_nVP);

   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
   auto* offload_scratch_ptr      = mw_offload_scratch.data();
   auto* results_scratch_ptr      = mw_results_scratch.data();
   const auto myKcart_padded_size = myKcart->capacity();
   auto* myKcart_ptr              = myKcart->data();
   auto* buffer_H2D_ptr           = det_ratios_buffer_H2D.data();
   auto* ratios_private_ptr       = mw_ratios_private.data();
   const size_t first_spo_local   = first_spo;

   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \
                 map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \
                 map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
     for (int iat = 0; iat < mw_nVP; iat++)
       for (int team_id = 0; team_id < NumTeams; team_id++)
       {
         // [first, last) is this chunk's range of spline entries
         const size_t first = ChunkSizePerTeam * team_id;
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);

         // per-particle slices of the scratch buffers, then the three sections of the packed buffer:
         // walker ids, the invRow pointer of this particle's walker, and the positions
         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
         auto* restrict psi_iat_ptr             = results_scratch_ptr + sposet_padded_size * iat;
         auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
         auto* restrict psiinv_ptr  = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
         auto* restrict pos_scratch = reinterpret_cast<ST*>(buffer_H2D_ptr + nw * sizeof(ValueType*));

         int ix, iy, iz;
         ST a[4], b[4], c[4];
         spline2::computeLocationAndFractional(spline_ptr, pos_scratch[iat * 6 + 3], pos_scratch[iat * 6 + 4],
                                               pos_scratch[iat * 6 + 5], ix, iy, iz, a, b, c);

         // step 1: spline values for this chunk go to the offload scratch
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = 0; index < last - first; index++)
           spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
                                              offload_scratch_iat_ptr + first + index);
         // spline indices are halved to get the orbital index range, capped at orb_size
         const size_t first_cplx = first / 2;
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         // step 2: C2C::assign_v is called once per orbital index with the scratch and output pointers
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr,
                         offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);

         // step 3: partial dot product of this chunk's orbital values with the walker's invRow
         ComplexT sum(0);
         PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
         for (int i = first_cplx; i < last_cplx; i++)
           sum += psi_iat_ptr[i] * psiinv_ptr[i];
         ratios_private_ptr[iat * NumTeams + team_id] = sum;
       }
   }

   // do the reduction manually
   // (iVP walks the flattened particle index in the same walker-major order used for packing)
   iVP = 0;
   for (size_t iw = 0; iw < nw; iw++)
   {
     auto& ratios = ratios_list[iw];
     for (size_t iat = 0; iat < ratios.size(); iat++, iVP++)
     {
       ratios[iat] = ComplexT(0);
       for (int tid = 0; tid < NumTeams; ++tid)
         ratios[iat] += mw_ratios_private[iVP][tid];
     }
   }
 }

 /** Build psi, its gradient and its Laplacian from the data already stored in myV, myG and myL.
  *
  * Usable when myL is precomputed and myV, myG, myL are given in cartesian coordinates.
  * Orbitals first_spo up to min(last_spo, psi.size()) are written; spline data for local
  * orbital j is read from the interleaved entries 2j (real) and 2j+1 (imaginary).
  *
  * @param r     cartesian position used for the phase
  * @param psi   output orbital values
  * @param dpsi  output orbital gradients
  * @param d2psi output orbital Laplacians
  */
 template<typename ST>
 inline void SplineC2COMPTarget<ST>::assign_vgl_from_l(const PointType& r,
                                                       ValueVector& psi,
                                                       GradVector& dpsi,
                                                       ValueVector& d2psi)
 {
   constexpr ST two(2);
   const ST pos_x = r[0];
   const ST pos_y = r[1];
   const ST pos_z = r[2];

   const ST* restrict kcart_x = myKcart->data(0);
   const ST* restrict kcart_y = myKcart->data(1);
   const ST* restrict kcart_z = myKcart->data(2);

   const ST* restrict grad_x = myG.data(0);
   const ST* restrict grad_y = myG.data(1);
   const ST* restrict grad_z = myG.data(2);

   // do not write past the end of psi
   const size_t orb_end  = psi.size() < last_spo ? psi.size() : last_spo;
   const size_t num_orbs = orb_end - first_spo;
 #pragma omp simd
   for (size_t iorb = 0; iorb < num_orbs; ++iorb)
   {
     const size_t re_idx = 2 * iorb;
     const size_t im_idx = re_idx + 1;

     const ST kx   = kcart_x[iorb];
     const ST ky   = kcart_y[iorb];
     const ST kz   = kcart_z[iorb];
     const ST v_re = myV[re_idx];
     const ST v_im = myV[im_idx];

     // sine and cosine of the phase -(k . r)
     ST sin_ph, cos_ph;
     omptarget::sincos(-(pos_x * kx + pos_y * ky + pos_z * kz), &sin_ph, &cos_ph);

     // spline gradient, real and imaginary parts
     const ST dx_re = grad_x[re_idx];
     const ST dy_re = grad_y[re_idx];
     const ST dz_re = grad_z[re_idx];

     const ST dx_im = grad_x[im_idx];
     const ST dy_im = grad_y[im_idx];
     const ST dz_im = grad_z[im_idx];

     // gradient before the phase rotation: real part gains +k * Im(v), imaginary part -k * Re(v)
     const ST gx_re = dx_re + v_im * kx;
     const ST gy_re = dy_re + v_im * ky;
     const ST gz_re = dz_re + v_im * kz;
     const ST gx_im = dx_im - v_re * kx;
     const ST gy_im = dy_im - v_re * ky;
     const ST gz_im = dz_im - v_re * kz;

     // Laplacian before the phase rotation
     const ST lapl_re = myL[re_idx] + (*mKK)[iorb] * v_re + two * (kx * dx_im + ky * dy_im + kz * dz_im);
     const ST lapl_im = myL[im_idx] + (*mKK)[iorb] * v_im - two * (kx * dx_re + ky * dy_re + kz * dz_re);

     // rotate every quantity by (cos_ph + i sin_ph) and store
     const size_t out_idx = iorb + first_spo;
     psi[out_idx]         = ComplexT(cos_ph * v_re - sin_ph * v_im, cos_ph * v_im + sin_ph * v_re);
     dpsi[out_idx][0]     = ComplexT(cos_ph * gx_re - sin_ph * gx_im, cos_ph * gx_im + sin_ph * gx_re);
     dpsi[out_idx][1]     = ComplexT(cos_ph * gy_re - sin_ph * gy_im, cos_ph * gy_im + sin_ph * gy_re);
     dpsi[out_idx][2]     = ComplexT(cos_ph * gz_re - sin_ph * gz_im, cos_ph * gz_im + sin_ph * gz_re);
     d2psi[out_idx]       = ComplexT(cos_ph * lapl_re - sin_ph * lapl_im, cos_ph * lapl_im + sin_ph * lapl_re);
   }
 }

 /** Evaluate orbital values, gradients and Laplacians for particle iat of P.
  *
  * The work is issued through PRAGMA_OFFLOAD("omp target teams distribute ...") in chunks of
  * ChunkSizePerTeam spline entries. Each chunk evaluates the spline value/gradient/Hessian into
  * offload_scratch, reduces the Hessian to a Laplacian with SymTrace, and then calls
  * C2C::assign_vgl per orbital to fill results_scratch. results_scratch consists of five
  * consecutive segments of sposet_padded_size elements which are copied back on the host as
  * value, gradient x, gradient y, gradient z and Laplacian.
  *
  * @param P     particle set supplying the active position
  * @param iat   particle index
  * @param psi   output orbital values; psi.size() determines how many orbitals are produced
  * @param dpsi  output orbital gradients
  * @param d2psi output orbital Laplacians
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluateVGL(const ParticleSet& P,
                                          const int iat,
                                          ValueVector& psi,
                                          GradVector& dpsi,
                                          ValueVector& d2psi)
 {
   const PointType& r = P.activeR(iat);
   PointType ru(PrimLattice.toUnit_floor(r));

   // the myV.size() spline entries are split into chunks of ChunkSizePerTeam (rounded up)
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;

   const auto spline_padded_size = myV.size();
   const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
   // one segment of spline_padded_size entries per SoAFields3D field
   offload_scratch.resize(spline_padded_size * SoAFields3D::NUM_FIELDS);
   // for V(1)G(3)L(1) final result
   results_scratch.resize(sposet_padded_size * 5);

   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr    = SplineInst->getSplinePtr();
   auto* offload_scratch_ptr = offload_scratch.data();
   auto* results_scratch_ptr = results_scratch.data();
   const auto x = r[0], y = r[1], z = r[2];
   const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
   const auto myKcart_padded_size = myKcart->capacity();
   auto* mKK_ptr                  = mKK->data();
   auto* GGt_ptr                  = GGt_offload->data();
   auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
   auto* myKcart_ptr              = myKcart->data();
   const size_t first_spo_local   = first_spo;
   const auto orb_size            = psi.size();

   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
                 map(always, from: results_scratch_ptr[0:sposet_padded_size*5])")
     for (int team_id = 0; team_id < NumTeams; team_id++)
     {
       // [first, last) is this chunk's range of spline entries
       const size_t first = ChunkSizePerTeam * team_id;
       const size_t last  = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);

       int ix, iy, iz;
       ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
       spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);

       // local copies of the 9 PrimLattice_G entries and of GGt folded to 6 entries:
       // the diagonal entries are taken as-is, each off-diagonal pair (ij, ji) is summed
       const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
                             PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
                             PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
       const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
                             GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};

       // step 1: value/gradient/Hessian per spline entry, then the LAPL field from the six HESS fields
       PRAGMA_OFFLOAD("omp parallel for")
       for (int index = 0; index < last - first; index++)
       {
         spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
                                              offload_scratch_ptr + first + index, spline_padded_size);
         const int output_index = first + index;
         offload_scratch_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
             SymTrace(offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
                      offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
                      offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
                      offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
                      offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
                      offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
       }

       // spline indices are halved to get the orbital index range, capped at orb_size
       const size_t first_cplx = first / 2;
       const size_t last_cplx  = omptarget::min(last / 2, orb_size);
       // step 2: C2C::assign_vgl is called once per orbital index with the scratch and result pointers
       PRAGMA_OFFLOAD("omp parallel for")
       for (int index = first_cplx; index < last_cplx; index++)
         C2C::assign_vgl(x, y, z, results_scratch_ptr, sposet_padded_size, mKK_ptr, offload_scratch_ptr,
                         spline_padded_size, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
     }
   }

   // unpack the five result segments into the caller's containers
   for (size_t i = 0; i < orb_size; i++)
   {
     psi[i]     = results_scratch[i];
     dpsi[i][0] = results_scratch[i + sposet_padded_size];
     dpsi[i][1] = results_scratch[i + sposet_padded_size * 2];
     dpsi[i][2] = results_scratch[i + sposet_padded_size * 3];
     d2psi[i]   = results_scratch[i + sposet_padded_size * 4];
   }
 }

 /** Evaluate orbital values, gradients and Laplacians at several positions in one offload region.
  *
  * The number of positions is psi_v_list.size(). multi_pos holds 6 values per position:
  * entries [0..2] are passed to C2C::assign_vgl as the position and entries [3..5] are passed
  * to spline2::computeLocationAndFractional. Per position the kernel performs the same steps
  * as evaluateVGL; the results of position iw occupy five consecutive segments of
  * sposet_padded_size elements starting at offset sposet_padded_size * iw * 5 in results_scratch
  * and are copied on the host into psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw].
  *
  * @param multi_pos       packed positions, 6 values per position
  * @param offload_scratch scratch for spline output, resized here (the parameter has the same
  *                        name as the member used by evaluateVGL)
  * @param results_scratch scratch for the V/G/L results, resized here (likewise)
  * @param psi_v_list      output orbital values, one vector per position
  * @param dpsi_v_list     output orbital gradients, one vector per position
  * @param d2psi_v_list    output orbital Laplacians, one vector per position
  *
  * NOTE(review): psi_v_list[0] is accessed unconditionally to obtain orb_size, so an empty
  * psi_v_list is not handled here — confirm that callers always pass at least one position.
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos,
                                                  Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
                                                  Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
                                                  const RefVector<ValueVector>& psi_v_list,
                                                  const RefVector<GradVector>& dpsi_v_list,
                                                  const RefVector<ValueVector>& d2psi_v_list) const
 {
   const size_t num_pos          = psi_v_list.size();
   // the myV.size() spline entries are split into chunks of ChunkSizePerTeam (rounded up)
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
   const auto spline_padded_size = myV.size();
   const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
   // per position: one segment of spline_padded_size entries per SoAFields3D field
   offload_scratch.resize(spline_padded_size * num_pos * SoAFields3D::NUM_FIELDS);
   // the first output vector defines how many orbitals are produced for every position
   const auto orb_size = psi_v_list[0].get().size();
   // for V(1)G(3)L(1) final result
   results_scratch.resize(sposet_padded_size * num_pos * 5);

   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
   auto* pos_copy_ptr             = multi_pos.data();
   auto* offload_scratch_ptr      = offload_scratch.data();
   auto* results_scratch_ptr      = results_scratch.data();
   const auto myKcart_padded_size = myKcart->capacity();
   auto* mKK_ptr                  = mKK->data();
   auto* GGt_ptr                  = GGt_offload->data();
   auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
   auto* myKcart_ptr              = myKcart->data();
   const size_t first_spo_local   = first_spo;

   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
                     map(always, to: pos_copy_ptr[0:num_pos*6]) \
                     map(always, from: results_scratch_ptr[0:sposet_padded_size*num_pos*5])")
     for (int iw = 0; iw < num_pos; iw++)
       for (int team_id = 0; team_id < NumTeams; team_id++)
       {
         // [first, last) is this chunk's range of spline entries
         const size_t first = ChunkSizePerTeam * team_id;
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);

         // slices of the scratch buffers that belong to position iw
         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw * SoAFields3D::NUM_FIELDS;
         auto* restrict psi_iw_ptr             = results_scratch_ptr + sposet_padded_size * iw * 5;

         int ix, iy, iz;
         ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
         spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
                                               pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);

         // local copies of the 9 PrimLattice_G entries and of GGt folded to 6 entries:
         // the diagonal entries are taken as-is, each off-diagonal pair (ij, ji) is summed
         const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
                               PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
                               PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
         const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
                               GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};

         // step 1: value/gradient/Hessian per spline entry, then the LAPL field from the six HESS fields
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = 0; index < last - first; index++)
         {
           spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
                                                d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
           const int output_index = first + index;
           offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
               SymTrace(offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
         }

         // spline indices are halved to get the orbital index range, capped at orb_size
         const size_t first_cplx = first / 2;
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         // step 2: C2C::assign_vgl is called once per orbital index with the scratch and result pointers
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
                           sposet_padded_size, mKK_ptr, offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr,
                           myKcart_padded_size, first_spo_local, index);
       }
   }

   // unpack the five result segments of every position into the caller's containers
   for (int iw = 0; iw < num_pos; ++iw)
   {
     auto* restrict results_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
     ValueVector& psi_v(psi_v_list[iw]);
     GradVector& dpsi_v(dpsi_v_list[iw]);
     ValueVector& d2psi_v(d2psi_v_list[iw]);
     for (size_t i = 0; i < orb_size; i++)
     {
       psi_v[i]     = results_iw_ptr[i];
       dpsi_v[i][0] = results_iw_ptr[i + sposet_padded_size];
       dpsi_v[i][1] = results_iw_ptr[i + sposet_padded_size * 2];
       dpsi_v[i][2] = results_iw_ptr[i + sposet_padded_size * 3];
       d2psi_v[i]   = results_iw_ptr[i + sposet_padded_size * 4];
     }
   }
 }

 /** Multi-walker evaluation of orbital values, gradients and Laplacians for particle iat.
  *
  * Packs, for every walker, the active position of particle iat and its image under
  * PrimLattice.toUnit_floor into the leader's shared position buffer (6 values per walker:
  * cartesian position first, then the toUnit_floor result) and forwards everything to
  * evaluateVGLMultiPos. Must be called on the leader of sa_list (checked by assert).
  *
  * @param sa_list      walker SPO sets; only the leader and the list size are used here
  * @param P_list       one particle set per walker
  * @param iat          particle index
  * @param psi_v_list   output orbital values, one vector per walker
  * @param dpsi_v_list  output orbital gradients, one vector per walker
  * @param d2psi_v_list output orbital Laplacians, one vector per walker
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::mw_evaluateVGL(const RefVectorWithLeader<SPOSet>& sa_list,
                                             const RefVectorWithLeader<ParticleSet>& P_list,
                                             int iat,
                                             const RefVector<ValueVector>& psi_v_list,
                                             const RefVector<GradVector>& dpsi_v_list,
                                             const RefVector<ValueVector>& d2psi_v_list) const
 {
   assert(this == &sa_list.getLeader());
   auto& leader          = sa_list.getCastedLeader<SplineC2COMPTarget<ST>>();
   auto& shared_buffers  = leader.mw_mem_handle_.getResource();
   auto& packed_pos      = shared_buffers.mw_pos_copy;
   const int num_walkers = sa_list.size();
   packed_pos.resize(num_walkers * 6);

   // per walker: slots [0..2] hold the cartesian position, slots [3..5] the toUnit_floor result
   for (int iw = 0; iw < num_walkers; ++iw)
   {
     const PointType& cart = P_list[iw].activeR(iat);
     PointType reduced(PrimLattice.toUnit_floor(cart));
     const int base = iw * 6;
     for (int dim = 0; dim < 3; ++dim)
     {
       packed_pos[base + dim]     = cart[dim];
       packed_pos[base + 3 + dim] = reduced[dim];
     }
   }

   leader.evaluateVGLMultiPos(packed_pos, shared_buffers.mw_offload_scratch, shared_buffers.mw_results_scratch,
                              psi_v_list, dpsi_v_list, d2psi_v_list);
 }

 template<typename ST>
 void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& spo_list,
                                                             const RefVectorWithLeader<ParticleSet>& P_list,
                                                             int iat,
                                                             const std::vector<const ValueType*>& invRow_ptr_list,
                                                             OffloadMWVGLArray& phi_vgl_v,
                                                             std::vector<ValueType>& ratios,
                                                             std::vector<GradType>& grads) const
 {
   assert(this == &spo_list.getLeader());
   auto& phi_leader         = spo_list.getCastedLeader<SplineC2COMPTarget<ST>>();
   auto& mw_mem             = phi_leader.mw_mem_handle_.getResource();
   auto& buffer_H2D         = mw_mem.buffer_H2D;
   auto& rg_private         = mw_mem.rg_private;
   auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
   auto& mw_results_scratch = mw_mem.mw_results_scratch;
   const int nwalkers       = spo_list.size();
   buffer_H2D.resize(nwalkers, sizeof(ST) * 6 + sizeof(ValueType*));

   // pack particle positions and invRow pointers.
   for (int iw = 0; iw < nwalkers; ++iw)
   {
     const PointType& r = P_list[iw].activeR(iat);
     PointType ru(PrimLattice.toUnit_floor(r));
     Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);

     pos_copy[0] = r[0];
     pos_copy[1] = r[1];
     pos_copy[2] = r[2];
     pos_copy[3] = ru[0];
     pos_copy[4] = ru[1];
     pos_copy[5] = ru[2];

     auto& invRow_ptr = *reinterpret_cast<const ValueType**>(buffer_H2D[iw] + sizeof(ST) * 6);
     invRow_ptr       = invRow_ptr_list[iw];
   }

   const size_t num_pos          = nwalkers;
   const auto orb_size           = phi_vgl_v.size(2);
   const auto spline_padded_size = myV.size();
   const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
   mw_offload_scratch.resize(spline_padded_size * num_pos * SoAFields3D::NUM_FIELDS);
   // for V(1)G(3)L(1) final result
   mw_results_scratch.resize(sposet_padded_size * num_pos * 5);
   // per team ratio and grads
   rg_private.resize(num_pos, NumTeams * 4);

   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
   auto* buffer_H2D_ptr           = buffer_H2D.data();
   auto* offload_scratch_ptr      = mw_offload_scratch.data();
   auto* results_scratch_ptr      = mw_results_scratch.data();
   const auto myKcart_padded_size = myKcart->capacity();
   auto* mKK_ptr                  = mKK->data();
   auto* GGt_ptr                  = GGt_offload->data();
   auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
   auto* myKcart_ptr              = myKcart->data();
   auto* phi_vgl_ptr              = phi_vgl_v.data();
   auto* rg_private_ptr           = rg_private.data();
   const size_t buffer_H2D_stride = buffer_H2D.cols();
   const size_t first_spo_local   = first_spo;
   const size_t phi_vgl_stride    = num_pos * orb_size;

   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
                     map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
                     map(always, from: rg_private_ptr[0:rg_private.size()])")
     for (int iw = 0; iw < num_pos; iw++)
       for (int team_id = 0; team_id < NumTeams; team_id++)
       {
         const size_t first = ChunkSizePerTeam * team_id;
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);

         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw * SoAFields3D::NUM_FIELDS;
         auto* restrict psi_iw_ptr             = results_scratch_ptr + sposet_padded_size * iw * 5;
         const auto* restrict pos_iw_ptr       = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
         const auto* restrict invRow_iw_ptr =
             *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);

         int ix, iy, iz;
         ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
         spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
                                               c, da, db, dc, d2a, d2b, d2c);

         const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
                               PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
                               PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
         const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
                               GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};

         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = 0; index < last - first; index++)
         {
           spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
                                                d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
           const int output_index = first + index;
           offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
               SymTrace(offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
                        offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
         }

         const size_t first_cplx = first / 2;
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, sposet_padded_size, mKK_ptr,
                           offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr, myKcart_padded_size,
                           first_spo_local, index);

         ValueType* restrict psi    = psi_iw_ptr;
         ValueType* restrict dpsi_x = psi_iw_ptr + sposet_padded_size;
         ValueType* restrict dpsi_y = psi_iw_ptr + sposet_padded_size * 2;
         ValueType* restrict dpsi_z = psi_iw_ptr + sposet_padded_size * 3;
         ValueType* restrict d2psi  = psi_iw_ptr + sposet_padded_size * 4;

         ValueType* restrict out_phi    = phi_vgl_ptr + iw * orb_size;
         ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
         ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
         ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
         ValueType* restrict out_d2phi  = out_dphi_z + phi_vgl_stride;

         ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
         PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
         for (int j = first_cplx; j < last_cplx; j++)
         {
           const size_t psiIndex = first_spo_local + j;

           out_phi[psiIndex]    = psi[psiIndex];
           out_dphi_x[psiIndex] = dpsi_x[psiIndex];
           out_dphi_y[psiIndex] = dpsi_y[psiIndex];
           out_dphi_z[psiIndex] = dpsi_z[psiIndex];
           out_d2phi[psiIndex]  = d2psi[psiIndex];

           ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
           grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex];
           grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex];
           grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex];
         }

         rg_private_ptr[(iw * NumTeams + team_id) * 4]     = ratio;
         rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
         rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
         rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
       }
   }

   for (int iw = 0; iw < num_pos; iw++)
   {
     ValueType ratio(0);
     for (int team_id = 0; team_id < NumTeams; team_id++)
       ratio += rg_private[iw][team_id * 4];
     ratios[iw] = ratio;

     ValueType grad_x(0), grad_y(0), grad_z(0);
     for (int team_id = 0; team_id < NumTeams; team_id++)
     {
       grad_x += rg_private[iw][team_id * 4 + 1];
       grad_y += rg_private[iw][team_id * 4 + 2];
       grad_z += rg_private[iw][team_id * 4 + 3];
     }
     grads[iw] = GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
   }
 }
 /** Apply the twist phase to the spline data held in myV, myG and myH and store the
  *  resulting complex orbital values, gradients and Hessians.
  *
  *  For each complex orbital j in [first, last) the real and imaginary spline parts are
  *  read from the interleaved entries 2j and 2j+1, combined with cos/sin of
  *  \f$-{\bf k}_j\cdot{\bf r}\f$ and written to index j + first_spo of the outputs.
  *
  *  The Hessian follows
  *  \f$\partial_a\partial_b (e^{-i{\bf k}\cdot{\bf r}}\phi) = e^{-i{\bf k}\cdot{\bf r}}
  *  (\partial_a\partial_b\phi - i k_a\partial_b\phi - i k_b\partial_a\phi - k_a k_b\phi)\f$
  *  which is symmetric in (a,b). Only the six independent components are computed and
  *  then mirrored into the nine tensor slots, the same layout assign_vghgh produces.
  *
  *  @param r             Cartesian position used for the phase
  *  @param psi           output orbital values
  *  @param dpsi          output orbital gradients
  *  @param grad_grad_psi output orbital Hessians (9 components per orbital)
  *  @param first         first complex orbital index to process
  *  @param last          one past the last complex orbital index; negative or too large
  *                       values are clamped to min(kPoints.size(), psi.size())
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::assign_vgh(const PointType& r,
                                         ValueVector& psi,
                                         GradVector& dpsi,
                                         HessVector& grad_grad_psi,
                                         int first,
                                         int last) const
 {
   // protect last
   const size_t last_cplx = std::min(kPoints.size(), psi.size());
   last                   = last < 0 ? last_cplx : (last > last_cplx ? last_cplx : last);

   const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
            g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
            g22 = PrimLattice.G(8);
   const ST x = r[0], y = r[1], z = r[2];

   const ST* restrict k0 = myKcart->data(0);
   const ST* restrict k1 = myKcart->data(1);
   const ST* restrict k2 = myKcart->data(2);

   const ST* restrict g0  = myG.data(0);
   const ST* restrict g1  = myG.data(1);
   const ST* restrict g2  = myG.data(2);
   const ST* restrict h00 = myH.data(0);
   const ST* restrict h01 = myH.data(1);
   const ST* restrict h02 = myH.data(2);
   const ST* restrict h11 = myH.data(3);
   const ST* restrict h12 = myH.data(4);
   const ST* restrict h22 = myH.data(5);

 #pragma omp simd
   for (size_t j = first; j < last; ++j)
   {
     // real and imaginary parts are interleaved in the spline storage
     int jr = j << 1;
     int ji = jr + 1;

     const ST kX    = k0[j];
     const ST kY    = k1[j];
     const ST kZ    = k2[j];
     const ST val_r = myV[jr];
     const ST val_i = myV[ji];

     //phase
     ST s, c;
     omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);

     //dot(PrimLattice.G,myG[j])
     const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
     const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
     const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];

     const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
     const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
     const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];

     // \f$\nabla \psi_r + {\bf k}\psi_i\f$
     const ST gX_r = dX_r + val_i * kX;
     const ST gY_r = dY_r + val_i * kY;
     const ST gZ_r = dZ_r + val_i * kZ;
     const ST gX_i = dX_i - val_r * kX;
     const ST gY_i = dY_i - val_r * kY;
     const ST gZ_i = dZ_i - val_r * kZ;

     const size_t psiIndex = j + first_spo;
     psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
     dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
     dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
     dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);

     // second derivatives of the bare spline parts in Cartesian coordinates
     const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
     const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
     const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
     const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
     const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
     const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);

     const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
     const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
     const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
     const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
     const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
     const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);

     // Phase corrections. With gB = dB -/+ k_B * val the real part is
     //   f_ab + kA * gB_i + kB * dA_i = f_ab + kA * dB_i + kB * dA_i - kA * kB * val_r
     // which reduces to kA * (gA_i + dA_i) on the diagonal. The previous off-diagonal
     // form kA * (gB_i + dB_i) dropped the kB * dA_i term and doubled kA * dB_i.
     const ST h_xx_r = f_xx_r + kX * (gX_i + dX_i);
     const ST h_xy_r = f_xy_r + kX * gY_i + kY * dX_i;
     const ST h_xz_r = f_xz_r + kX * gZ_i + kZ * dX_i;
     const ST h_yy_r = f_yy_r + kY * (gY_i + dY_i);
     const ST h_yz_r = f_yz_r + kY * gZ_i + kZ * dY_i;
     const ST h_zz_r = f_zz_r + kZ * (gZ_i + dZ_i);

     const ST h_xx_i = f_xx_i - kX * (gX_r + dX_r);
     const ST h_xy_i = f_xy_i - kX * gY_r - kY * dX_r;
     const ST h_xz_i = f_xz_i - kX * gZ_r - kZ * dX_r;
     const ST h_yy_i = f_yy_i - kY * (gY_r + dY_r);
     const ST h_yz_i = f_yz_i - kY * gZ_r - kZ * dY_r;
     const ST h_zz_i = f_zz_i - kZ * (gZ_r + dZ_r);

     // rotate by the phase once per independent component
     const ComplexT hess_xx(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
     const ComplexT hess_xy(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
     const ComplexT hess_xz(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
     const ComplexT hess_yy(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
     const ComplexT hess_yz(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
     const ComplexT hess_zz(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);

     // mirror into the 3x3 tensor: slots 3, 6 and 7 repeat xy, xz and yz
     grad_grad_psi[psiIndex][0] = hess_xx;
     grad_grad_psi[psiIndex][1] = hess_xy;
     grad_grad_psi[psiIndex][2] = hess_xz;
     grad_grad_psi[psiIndex][3] = hess_xy;
     grad_grad_psi[psiIndex][4] = hess_yy;
     grad_grad_psi[psiIndex][5] = hess_yz;
     grad_grad_psi[psiIndex][6] = hess_xz;
     grad_grad_psi[psiIndex][7] = hess_yz;
     grad_grad_psi[psiIndex][8] = hess_zz;
   }
 }

 /** Evaluate values, gradients and Hessians of all orbitals at the active position
  *  of particle iat.
  *
  *  The spline range is split among the OpenMP threads with FairDivideAligned; each
  *  thread evaluates its share of the real-valued spline data and then converts that
  *  share to complex orbitals through assign_vgh.
  *
  *  @param P             particle set supplying the active position
  *  @param iat           particle index
  *  @param psi           output orbital values
  *  @param dpsi          output orbital gradients
  *  @param grad_grad_psi output orbital Hessians
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluateVGH(const ParticleSet& P,
                                          const int iat,
                                          ValueVector& psi,
                                          GradVector& dpsi,
                                          HessVector& grad_grad_psi)
 {
   const PointType& pos_cart = P.activeR(iat);
   PointType pos_unit(PrimLattice.toUnit_floor(pos_cart));

   // psi is complex while the spline storage and evaluation use a real type,
   // so the number of real entries to divide is twice the orbital count
   const auto num_real_entries = 2 * psi.size();
   const auto alignment        = getAlignment<ST>();

 #pragma omp parallel
   {
     int real_first, real_last;
     FairDivideAligned(num_real_entries, alignment, omp_get_num_threads(), omp_get_thread_num(), real_first,
                       real_last);

     spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), pos_unit, myV, myG, myH, real_first, real_last);
     // halve the real-entry range to obtain the complex orbital range
     assign_vgh(pos_cart, psi, dpsi, grad_grad_psi, real_first / 2, real_last / 2);
   }
 }

 /** Apply the twist phase to the spline data held in myV, myG, myH and mygH and store
  *  complex orbital values, gradients, Hessians and third derivatives.
  *
  *  For each complex orbital j in [first, last) the real and imaginary spline parts are
  *  read from the interleaved entries 2j and 2j+1 and the results are written to index
  *  j + first_spo of the output vectors.
  *
  *  @param r                  Cartesian position used for the phase
  *  @param psi                output orbital values
  *  @param dpsi               output orbital gradients
  *  @param grad_grad_psi      output Hessians (9 slots, symmetric entries duplicated)
  *  @param grad_grad_grad_psi output third derivatives (3 x 9 slots, symmetric entries duplicated)
  *  @param first              first complex orbital index to process
  *  @param last               one past the last complex orbital index; negative or too large
  *                            values are clamped to min(kPoints.size(), psi.size())
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::assign_vghgh(const PointType& r,
                                           ValueVector& psi,
                                           GradVector& dpsi,
                                           HessVector& grad_grad_psi,
                                           GGGVector& grad_grad_grad_psi,
                                           int first,
                                           int last) const
 {
   // protect last
   const size_t last_cplx = std::min(kPoints.size(), psi.size());
   last                   = last < 0 ? last_cplx : (last > last_cplx ? last_cplx : last);

   // reciprocal lattice matrix elements used to convert spline derivatives to Cartesian ones
   const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
            g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
            g22 = PrimLattice.G(8);
   const ST x = r[0], y = r[1], z = r[2];

   // Cartesian k-vector components, one entry per complex orbital
   const ST* restrict k0 = myKcart->data(0);
   const ST* restrict k1 = myKcart->data(1);
   const ST* restrict k2 = myKcart->data(2);

   // spline gradient (3), Hessian (6) and third-derivative (10) component arrays
   const ST* restrict g0  = myG.data(0);
   const ST* restrict g1  = myG.data(1);
   const ST* restrict g2  = myG.data(2);
   const ST* restrict h00 = myH.data(0);
   const ST* restrict h01 = myH.data(1);
   const ST* restrict h02 = myH.data(2);
   const ST* restrict h11 = myH.data(3);
   const ST* restrict h12 = myH.data(4);
   const ST* restrict h22 = myH.data(5);

   const ST* restrict gh000 = mygH.data(0);
   const ST* restrict gh001 = mygH.data(1);
   const ST* restrict gh002 = mygH.data(2);
   const ST* restrict gh011 = mygH.data(3);
   const ST* restrict gh012 = mygH.data(4);
   const ST* restrict gh022 = mygH.data(5);
   const ST* restrict gh111 = mygH.data(6);
   const ST* restrict gh112 = mygH.data(7);
   const ST* restrict gh122 = mygH.data(8);
   const ST* restrict gh222 = mygH.data(9);

 // NOTE(review): an older remark here stated that SIMD "doesn't work quite right yet" and asked
 // for the pragma to be commented out, but the pragma below is active - confirm which is intended.
 #pragma omp simd
   for (size_t j = first; j < last; ++j)
   {
     // real and imaginary parts are interleaved in the spline storage
     int jr = j << 1;
     int ji = jr + 1;

     const ST kX    = k0[j];
     const ST kY    = k1[j];
     const ST kZ    = k2[j];
     const ST val_r = myV[jr];
     const ST val_i = myV[ji];

     //phase
     ST s, c;
     omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);

     //dot(PrimLattice.G,myG[j])
     const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
     const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
     const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];

     const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
     const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
     const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];

     // \f$\nabla \psi_r + {\bf k}\psi_i\f$
     const ST gX_r = dX_r + val_i * kX;
     const ST gY_r = dY_r + val_i * kY;
     const ST gZ_r = dZ_r + val_i * kZ;
     const ST gX_i = dX_i - val_r * kX;
     const ST gY_i = dY_i - val_r * kY;
     const ST gZ_i = dZ_i - val_r * kZ;

     // outputs are offset by first_spo; each is the phase (c + i s) times the (real, imag) pair
     const size_t psiIndex = j + first_spo;
     psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
     dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
     dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
     dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);

     //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
     const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
     const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
     const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
     const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
     const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
     const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);

     const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
     const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
     const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
     const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
     const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
     const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);

     // Hessian before the phase rotation: f_ab plus the k-dependent gradient and value terms
     const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
     const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
     const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
     const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
     const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
     const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;

     const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
     const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
     const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
     const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
     const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
     const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;

     // slots 3, 6 and 7 repeat the xy, xz and yz components
     grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
     grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
     grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
     grad_grad_psi[psiIndex][3] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
     grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
     grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
     grad_grad_psi[psiIndex][6] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
     grad_grad_psi[psiIndex][7] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
     grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);

     //These are the real and imaginary components of the third SPO derivative.  _xxx denotes
     // third derivative w.r.t. x, _xyz, a derivative with respect to x,y, and z, and so on.

     const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
     const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
     const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
     const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
     const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
     const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
     const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
     const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
     const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
     const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
                                     gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);

     const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
     const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
     const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
     const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
     const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
     const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
     const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
     const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
     const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
     const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
                                     gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);

     //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
     const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
     const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
     const ST gh_xxy_r =
         f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
     const ST gh_xxy_i =
         f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
     const ST gh_xxz_r =
         f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
     const ST gh_xxz_i =
         f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
     const ST gh_xyy_r =
         f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
     const ST gh_xyy_i =
         f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
     const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
         (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
     const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
         (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
     const ST gh_xzz_r =
         f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
     const ST gh_xzz_i =
         f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
     const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
     const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
     const ST gh_yyz_r =
         f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
     const ST gh_yyz_i =
         f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
     const ST gh_yzz_r =
         f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
     const ST gh_yzz_i =
         f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
     const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
     const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;

     // first index 0: x-derivative of the Hessian, stored in the same 9-slot layout as grad_grad_psi
     grad_grad_grad_psi[psiIndex][0][0] = ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
     grad_grad_grad_psi[psiIndex][0][1] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
     grad_grad_grad_psi[psiIndex][0][2] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
     grad_grad_grad_psi[psiIndex][0][3] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
     grad_grad_grad_psi[psiIndex][0][4] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
     grad_grad_grad_psi[psiIndex][0][5] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
     grad_grad_grad_psi[psiIndex][0][6] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
     grad_grad_grad_psi[psiIndex][0][7] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
     grad_grad_grad_psi[psiIndex][0][8] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);

     // first index 1: y-derivative of the Hessian
     grad_grad_grad_psi[psiIndex][1][0] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
     grad_grad_grad_psi[psiIndex][1][1] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
     grad_grad_grad_psi[psiIndex][1][2] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
     grad_grad_grad_psi[psiIndex][1][3] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
     grad_grad_grad_psi[psiIndex][1][4] = ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
     grad_grad_grad_psi[psiIndex][1][5] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
     grad_grad_grad_psi[psiIndex][1][6] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
     grad_grad_grad_psi[psiIndex][1][7] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
     grad_grad_grad_psi[psiIndex][1][8] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);


     // first index 2: z-derivative of the Hessian
     grad_grad_grad_psi[psiIndex][2][0] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
     grad_grad_grad_psi[psiIndex][2][1] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
     grad_grad_grad_psi[psiIndex][2][2] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
     grad_grad_grad_psi[psiIndex][2][3] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
     grad_grad_grad_psi[psiIndex][2][4] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
     grad_grad_grad_psi[psiIndex][2][5] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
     grad_grad_grad_psi[psiIndex][2][6] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
     grad_grad_grad_psi[psiIndex][2][7] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
     grad_grad_grad_psi[psiIndex][2][8] = ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
   }
 }

 /** Evaluate values, gradients, Hessians and third derivatives of all orbitals at the
  *  active position of particle iat.
  *
  *  The spline range is split among the OpenMP threads with FairDivideAligned; each
  *  thread evaluates its share of the real-valued spline data and then converts that
  *  share to complex orbitals through assign_vghgh.
  *
  *  @param P                  particle set supplying the active position
  *  @param iat                particle index
  *  @param psi                output orbital values
  *  @param dpsi               output orbital gradients
  *  @param grad_grad_psi      output orbital Hessians
  *  @param grad_grad_grad_psi output orbital third derivatives
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluateVGHGH(const ParticleSet& P,
                                            const int iat,
                                            ValueVector& psi,
                                            GradVector& dpsi,
                                            HessVector& grad_grad_psi,
                                            GGGVector& grad_grad_grad_psi)
 {
   const PointType& pos_cart = P.activeR(iat);
   PointType pos_unit(PrimLattice.toUnit_floor(pos_cart));

   // two real spline entries (real and imaginary part) per complex orbital
   const auto num_real_entries = 2 * psi.size();
   const auto alignment        = getAlignment<ST>();

 #pragma omp parallel
   {
     int real_first, real_last;
     FairDivideAligned(num_real_entries, alignment, omp_get_num_threads(), omp_get_thread_num(), real_first,
                       real_last);

     spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), pos_unit, myV, myG, myH, mygH, real_first, real_last);
     // halve the real-entry range to obtain the complex orbital range
     assign_vghgh(pos_cart, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, real_first / 2, real_last / 2);
   }
 }

 /** Fill rows of logdet, dlogdet and d2logdet with orbital values, gradients and
  *  Laplacians for particles [first, last).
  *
  *  Particles are handled in batches of at most 16. For every batch the Cartesian and
  *  unit-cell positions are packed into multi_pos_copy (six entries per particle), row
  *  views onto the output matrices are collected, and evaluateVGLMultiPos is called once.
  *
  *  @param P        particle set supplying the active positions
  *  @param first    first particle index
  *  @param last     one past the last particle index
  *  @param logdet   output values, row i corresponds to particle first + i
  *  @param dlogdet  output gradients, same row convention
  *  @param d2logdet output Laplacians, same row convention
  */
 template<typename ST>
 void SplineC2COMPTarget<ST>::evaluate_notranspose(const ParticleSet& P,
                                                   int first,
                                                   int last,
                                                   ValueMatrix& logdet,
                                                   GradMatrix& dlogdet,
                                                   ValueMatrix& d2logdet)
 {
   // upper bound on particles per batch; keeps temporary memory usage small
   const int max_batch = 16;

   // Row views onto the output matrices. Capacity is reserved once so that filling a
   // batch never reallocates while the reference lists below refer to the elements.
   std::vector<ValueVector> row_psi;
   row_psi.reserve(max_batch);
   std::vector<GradVector> row_dpsi;
   row_dpsi.reserve(max_batch);
   std::vector<ValueVector> row_d2psi;
   row_d2psi.reserve(max_batch);

   RefVector<ValueVector> psi_refs;
   psi_refs.reserve(max_batch);
   RefVector<GradVector> dpsi_refs;
   dpsi_refs.reserve(max_batch);
   RefVector<ValueVector> d2psi_refs;
   d2psi_refs.reserve(max_batch);

   // row_offset tracks the output row matching batch_start (row 0 <-> particle `first`)
   int row_offset = 0;
   for (int batch_start = first; batch_start < last; batch_start += max_batch)
   {
     const int remaining = last - batch_start;
     const int batch_len = remaining < max_batch ? remaining : max_batch;

     multi_pos_copy.resize(batch_len * 6);
     row_psi.clear();
     row_dpsi.clear();
     row_d2psi.clear();
     psi_refs.clear();
     dpsi_refs.clear();
     d2psi_refs.clear();

     for (int k = 0; k < batch_len; ++k)
     {
       // pack Cartesian position followed by unit-cell position
       const PointType& cart = P.activeR(batch_start + k);
       PointType unit(PrimLattice.toUnit_floor(cart));
       const int base = k * 6;
       for (int d = 0; d < 3; ++d)
       {
         multi_pos_copy[base + d]     = cart[d];
         multi_pos_copy[base + 3 + d] = unit[d];
       }

       const int row = row_offset + k;
       row_psi.emplace_back(logdet[row], logdet.cols());
       row_dpsi.emplace_back(dlogdet[row], dlogdet.cols());
       row_d2psi.emplace_back(d2logdet[row], d2logdet.cols());

       psi_refs.push_back(row_psi.back());
       dpsi_refs.push_back(row_dpsi.back());
       d2psi_refs.push_back(row_d2psi.back());
     }

     evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_refs, dpsi_refs, d2psi_refs);

     row_offset += max_batch;
   }
 }

 // Explicitly instantiate the class template for ST = float and ST = double so the
 // member definitions in this translation unit are emitted for both types.
 template class SplineC2COMPTarget<float>;
 template class SplineC2COMPTarget<double>;

 } // namespace qmcplusplus
qmcplusplus::SPOSet::HessVector
OrbitalSetTraits< ValueType >::HessVector HessVector
Definition: SPOSet.h:53

qmcplusplus::HESS22
Definition: contraction_helper.hpp:29

qmcplusplus::HESS12
Definition: contraction_helper.hpp:28

qmcplusplus::SymTrace
T SymTrace(T h00, T h01, T h02, T h11, T h12, T h22, const T gg[6])
compute Trace(H*G)
Definition: contraction_helper.hpp:45

qmcplusplus::TinyVector
Fixed-size array.
Definition: OhmmsTinyMeta.h:30

ApplyPhaseC2C.hpp

qmcplusplus::Units::time::s
const real s
Definition: unit_conversion.h:47

qmcplusplus::SplineC2COMPTarget::evaluateVGL
virtual void evaluateVGL(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi) override
evaluate the values, gradients and laplacians of this single-particle orbital set ...
Definition: SplineC2COMPTarget.cpp:384

qmcplusplus::HESS00
Definition: contraction_helper.hpp:24

qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43

qmcplusplus::SplineC2COMPTarget::evaluateValue
virtual void evaluateValue(const ParticleSet &P, const int iat, ValueVector &psi) override
evaluate the values of this single-particle orbital set
Definition: SplineC2COMPTarget.cpp:82

qmcplusplus::QMCTraits::RealType
QTBase::RealType RealType
Definition: Configuration.h:58

qmcplusplus::RefVectorWithLeader::getLeader
T & getLeader() const
Definition: RefVectorWithLeader.h:34

OpenMP.h

qmcplusplus::SplineC2COMPTarget::mw_mem_handle_
ResourceHandle< SplineOMPTargetMultiWalkerMem< ST, ComplexT > > mw_mem_handle_
Definition: SplineC2COMPTarget.h:81

Array::data
Type_t * data()
Definition: OhmmsArray.h:87

qmcplusplus::VirtualParticleSet
A ParticleSet that handles virtual moves of a selected particle of a given physical ParticleSet Virtu...
Definition: VirtualParticleSet.h:39

qmcplusplus::SPOSet::ValueMatrix
OrbitalSetTraits< ValueType >::ValueMatrix ValueMatrix
Definition: SPOSet.h:50

qmcplusplus::VirtualParticleSet::getTotalNum
size_t getTotalNum() const
Definition: VirtualParticleSet.h:98

qmcplusplus::v_m_v
T v_m_v(T h00, T h01, T h02, T h11, T h12, T h22, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z)
compute vector[3]^T x matrix[3][3] x vector[3]
Definition: contraction_helper.hpp:54

qmcplusplus::t3_contract
T t3_contract(T h000, T h001, T h002, T h011, T h012, T h022, T h111, T h112, T h122, T h222, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z, T g3x, T g3y, T g3z)
Coordinate transform for a 3rd rank symmetric tensor representing coordinate derivatives (hence t3_co...
Definition: contraction_helper.hpp:69

qmcplusplus::hdf_archive
class to handle hdf file
Definition: hdf_archive.h:51

qmcplusplus::SplineC2COMPTarget::write_splines
bool write_splines(hdf_archive &h5f)
Definition: SplineC2COMPTarget.cpp:47

qmcplusplus::Vector< ST, aligned_allocator< ST > >

qmcplusplus::C2C::assign_v
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
Definition: ApplyPhaseC2C.hpp:20

qmcplusplus::TinyVector::size
int size() const
Definition: TinyVector.h:115

qmcplusplus::SplineC2COMPTarget::evaluateVGH
virtual void evaluateVGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi) override
evaluate the values, gradients and hessians of this single-particle orbital set
Definition: SplineC2COMPTarget.cpp:891

qmcplusplus::SplineC2COMPTarget
class to match std::complex<ST> spline with BsplineSet::ValueType (complex) SPOs with OpenMP offload ...
Definition: SplineC2COMPTarget.h:41

omptarget::min
T min(T a, T b)
Definition: OMPTargetMath.hpp:36

qmcplusplus::SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads
virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const std::vector< const ValueType *> &invRow_ptr_list, OffloadMWVGLArray &phi_vgl_v, std::vector< ValueType > &ratios, std::vector< GradType > &grads) const override
evaluate the values, gradients and laplacians of these single-particle orbital sets and determinant ra...
Definition: SplineC2COMPTarget.cpp:601

qmcplusplus::SplineC2COMPTarget::evaluateVGLMultiPos
void evaluateVGLMultiPos(const Vector< ST, OffloadPinnedAllocator< ST >> &multi_pos_copy, Vector< ST, OffloadPinnedAllocator< ST >> &offload_scratch, Vector< ComplexT, OffloadPinnedAllocator< ComplexT >> &results_scratch, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const
Definition: SplineC2COMPTarget.cpp:470

qmcplusplus::SPOSet::GradMatrix
OrbitalSetTraits< ValueType >::GradMatrix GradMatrix
Definition: SPOSet.h:52

qmcplusplus::HESS02
Definition: contraction_helper.hpp:26

omp_get_thread_num
omp_int_t omp_get_thread_num()
Definition: OpenMP.h:25

qmcplusplus::ParticleSet
Specialized particle class for atomistic simulations.
Definition: ParticleSet.h:55

qmcplusplus::HESS01
Definition: contraction_helper.hpp:25

qmcplusplus::SplineC2COMPTarget::SplineC2COMPTarget
SplineC2COMPTarget(const std::string &my_name)
Definition: SplineC2COMPTarget.h:110

qmcplusplus::HESS11
Definition: contraction_helper.hpp:27

qmcplusplus::QMCTraits::ValueType
QTBase::ValueType ValueType
Definition: Configuration.h:60

FairDivideAligned
void FairDivideAligned(const int ntot, const int base, const int npart, const int me, int &first, int &last)
Partition ntot over npart and the size of each partition is a multiple of base size.
Definition: FairDivide.h:96

qmcplusplus::SPOSet::ValueVector
OrbitalSetTraits< ValueType >::ValueVector ValueVector
Definition: SPOSet.h:49

qmcplusplus::SplineC2COMPTarget::mw_evaluateDetRatios
virtual void mw_evaluateDetRatios(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< const VirtualParticleSet > &vp_list, const RefVector< ValueVector > &psi_list, const std::vector< const ValueType *> &invRow_ptr_list, std::vector< std::vector< ValueType >> &ratios_list) const override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP, of multiple walkers ...
Definition: SplineC2COMPTarget.cpp:192

qmcplusplus::NUM_FIELDS
Definition: contraction_helper.hpp:31

Array::size
size_t size() const
Definition: OhmmsArray.h:57

qmcplusplus::SplineC2COMPTarget::assign_vgh
void assign_vgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, int first, int last) const
Definition: SplineC2COMPTarget.cpp:771

qmcplusplus::SplineC2COMPTarget::assign_vghgh
void assign_vghgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi, int first=0, int last=-1) const
Definition: SplineC2COMPTarget.cpp:912

qmcplusplus::ParticleSet::activeR
const PosType & activeR(int iat) const
return the active position if the particle is active, or the current position if not ...
Definition: ParticleSet.h:265

omp_get_num_threads
omp_int_t omp_get_num_threads()
Definition: OpenMP.h:27

qmcplusplus::OMPallocator
OMPallocator is an allocator with fused device and dualspace allocator functionality.
Definition: OMPallocator.hpp:60

qmcplusplus::SplineC2COMPTarget::ComplexT
typename BsplineSet::ValueType ComplexT
Definition: SplineC2COMPTarget.h:50

contraction_helper.hpp

qmcplusplus::SplineC2COMPTarget::assign_v
void assign_v(const PointType &r, const vContainer_type &myV, ValueVector &psi, int first, int last) const
Definition: SplineC2COMPTarget.cpp:56

qmcplusplus::SplineC2COMPTarget::evaluate_notranspose
virtual void evaluate_notranspose(const ParticleSet &P, int first, int last, ValueMatrix &logdet, GradMatrix &dlogdet, ValueMatrix &d2logdet) override
evaluate the values, gradients and laplacians of this single-particle orbital for [first...
Definition: SplineC2COMPTarget.cpp:1168

ompReductionComplex.hpp

qmcplusplus::RefVector
std::vector< std::reference_wrapper< T > > RefVector
Definition: template_types.hpp:32

qmcplusplus::SplineC2COMPTarget::mw_evaluateVGL
virtual void mw_evaluateVGL(const RefVectorWithLeader< SPOSet > &sa_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const override
evaluate the values, gradients and laplacians of these single-particle orbital sets of multiple walker...
Definition: SplineC2COMPTarget.cpp:567

qmcplusplus::SplineC2COMPTarget::read_splines
bool read_splines(hdf_archive &h5f)
Definition: SplineC2COMPTarget.cpp:38

qmcplusplus::RefVectorWithLeader::getCastedLeader
CASTTYPE & getCastedLeader() const
Definition: RefVectorWithLeader.h:39

qmcplusplus::TinyVector::data
Type_t * data()
Definition: TinyVector.h:138

qmcplusplus::SPOSet::GGGVector
OrbitalSetTraits< ValueType >::GradHessVector GGGVector
Definition: SPOSet.h:55

qmcplusplus::syclBLAS::copy_n
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548

qmcplusplus::SPOSet::GradVector
OrbitalSetTraits< ValueType >::GradVector GradVector
Definition: SPOSet.h:51

qmcplusplus::C2C::assign_vgl
void assign_vgl(ST x, ST y, ST z, TT *restrict results_scratch_ptr, size_t orb_padded_size, const ST *mKK_ptr, const ST *restrict offload_scratch_ptr, size_t spline_padded_size, const ST G[9], const ST *myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
assign_vgl
Definition: ApplyPhaseC2C.hpp:49

qmcplusplus::SplineC2COMPTarget::assign_vgl_from_l
void assign_vgl_from_l(const PointType &r, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi)
assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian ...
Definition: SplineC2COMPTarget.cpp:320

qmcplusplus::SplineC2COMPTarget::evaluateDetRatios
virtual void evaluateDetRatios(const VirtualParticleSet &VP, ValueVector &psi, const ValueVector &psiinv, std::vector< ValueType > &ratios) override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP
Definition: SplineC2COMPTarget.cpp:99

SplineC2COMPTarget.h
class to handle complex splines to complex orbitals with splines of arbitrary precision splines stora...

qmcplusplus::sincos
void sincos(T a, T *restrict s, T *restrict c)
sincos function wrapper
Definition: math.hpp:62

qmcplusplus::SplineC2COMPTarget::evaluateVGHGH
virtual void evaluateVGHGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi) override
evaluate the values, gradients, hessians, and grad hessians of this single-particle orbital set ...
Definition: SplineC2COMPTarget.cpp:1148

qmcplusplus::RefVectorWithLeader
Definition: RefVectorWithLeader.h:23

qmcplusplus::Units::force::N
const real N
Definition: unit_conversion.h:92

qmcplusplus::hdf_archive::readEntry
bool readEntry(T &data, const std::string &aname)
read the data from the group aname and return status; use read() for inbuilt error checking ...
Definition: hdf_archive.h:293

Array
A D-dimensional Array class based on PETE.
Definition: OhmmsArray.h:25

qmcplusplus::SplineC2COMPTarget::set_spline
void set_spline(SingleSplineType *spline_r, SingleSplineType *spline_i, int twist, int ispline, int level)
Definition: SplineC2COMPTarget.cpp:27

qmcplusplus::hdf_archive::writeEntry
bool writeEntry(T &data, const std::string &aname)
write the data to the group aname and return status; use write() for inbuilt error checking ...
Definition: hdf_archive.h:244

qmcplusplus::ScopeGuard
Definition: NewTimer.h:241

qmcplusplus::LAPL
Definition: contraction_helper.hpp:30

qmcplusplus::SplineC2COMPTarget::SingleSplineType
UBspline_3d_d SingleSplineType
Definition: SplineC2COMPTarget.h:48