QMCPACK
SplineC2COMPTarget.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2020 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 
13 #include "SplineC2COMPTarget.h"
14 #include "spline2/MultiBsplineEval.hpp"
15 #include "spline2/MultiBsplineEval_OMPoffload.hpp"
18 #include "ApplyPhaseC2C.hpp"
19 #include "Concurrency/OpenMP.h"
20 
21 namespace qmcplusplus
22 {
23 template<typename ST>
24 SplineC2COMPTarget<ST>::SplineC2COMPTarget(const SplineC2COMPTarget& in) = default;
25 
26 template<typename ST>
28  SingleSplineType* spline_i,
29  int twist,
30  int ispline,
31  int level)
32 {
33  SplineInst->copy_spline(spline_r, 2 * ispline);
34  SplineInst->copy_spline(spline_i, 2 * ispline + 1);
35 }
36 
37 template<typename ST>
39 {
40  std::ostringstream o;
41  o << "spline_" << MyIndex;
42  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
43  return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
44 }
45 
46 template<typename ST>
48 {
49  std::ostringstream o;
50  o << "spline_" << MyIndex;
51  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
52  return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
53 }
54 
55 template<typename ST>
57  const vContainer_type& myV,
58  ValueVector& psi,
59  int first,
60  int last) const
61 {
62  // protect last
63  const size_t last_cplx = std::min(kPoints.size(), psi.size());
64  last = last > last_cplx ? last_cplx : last;
65 
66  const ST x = r[0], y = r[1], z = r[2];
67  const ST* restrict kx = myKcart->data(0);
68  const ST* restrict ky = myKcart->data(1);
69  const ST* restrict kz = myKcart->data(2);
70 #pragma omp simd
71  for (size_t j = first; j < last; ++j)
72  {
73  ST s, c;
74  const ST val_r = myV[2 * j];
75  const ST val_i = myV[2 * j + 1];
76  omptarget::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
77  psi[j + first_spo] = ComplexT(val_r * c - val_i * s, val_i * c + val_r * s);
78  }
79 }
80 
81 template<typename ST>
83 {
84  const PointType& r = P.activeR(iat);
85  PointType ru(PrimLattice.toUnit_floor(r));
86 
87 #pragma omp parallel
88  {
89  int first, last;
90  // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
91  FairDivideAligned(2 * psi.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
92 
93  spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
94  assign_v(r, myV, psi, first / 2, last / 2);
95  }
96 }
97 
98 template<typename ST>
100  ValueVector& psi,
101  const ValueVector& psiinv,
102  std::vector<ValueType>& ratios)
103 {
104  const int nVP = VP.getTotalNum();
105  psiinv_pos_copy.resize(psiinv.size() + nVP * 3);
106 
107  // stage psiinv to psiinv_pos_copy
108  std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
109 
110  // pack particle positions
111  auto* restrict pos_scratch = reinterpret_cast<RealType*>(psiinv_pos_copy.data() + psiinv.size());
112  for (int iat = 0; iat < nVP; ++iat)
113  {
114  const PointType& r = VP.activeR(iat);
115  PointType ru(PrimLattice.toUnit_floor(r));
116  pos_scratch[iat * 6] = r[0];
117  pos_scratch[iat * 6 + 1] = r[1];
118  pos_scratch[iat * 6 + 2] = r[2];
119  pos_scratch[iat * 6 + 3] = ru[0];
120  pos_scratch[iat * 6 + 4] = ru[1];
121  pos_scratch[iat * 6 + 5] = ru[2];
122  }
123 
124  const size_t ChunkSizePerTeam = 512;
125  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
126  ratios_private.resize(nVP, NumTeams);
127  const auto spline_padded_size = myV.size();
128  const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
129  offload_scratch.resize(spline_padded_size * nVP);
130  results_scratch.resize(sposet_padded_size * nVP);
131 
132  // Ye: need to extract sizes and pointers before entering target region
133  const auto* spline_ptr = SplineInst->getSplinePtr();
134  auto* offload_scratch_ptr = offload_scratch.data();
135  auto* results_scratch_ptr = results_scratch.data();
136  const auto myKcart_padded_size = myKcart->capacity();
137  auto* myKcart_ptr = myKcart->data();
138  auto* psiinv_ptr = psiinv_pos_copy.data();
139  auto* ratios_private_ptr = ratios_private.data();
140  const size_t first_spo_local = first_spo;
141  const auto orb_size = psiinv.size();
142 
143  {
144  ScopedTimer offload(offload_timer_);
145  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \
146  map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \
147  map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
148  for (int iat = 0; iat < nVP; iat++)
149  for (int team_id = 0; team_id < NumTeams; team_id++)
150  {
151  const size_t first = ChunkSizePerTeam * team_id;
152  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
153 
154  auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
155  auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
156  auto* restrict pos_scratch = reinterpret_cast<RealType*>(psiinv_ptr + orb_size);
157 
158  int ix, iy, iz;
159  ST a[4], b[4], c[4];
160  spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
161  ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
162 
163  PRAGMA_OFFLOAD("omp parallel for")
164  for (int index = 0; index < last - first; index++)
165  spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
166  offload_scratch_iat_ptr + first + index);
167  const size_t first_cplx = first / 2;
168  const size_t last_cplx = omptarget::min(last / 2, orb_size);
169  PRAGMA_OFFLOAD("omp parallel for")
170  for (int index = first_cplx; index < last_cplx; index++)
171  C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
172  psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
173 
174  ComplexT sum(0);
175  PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
176  for (int i = first_cplx; i < last_cplx; i++)
177  sum += psi_iat_ptr[i] * psiinv_ptr[i];
178  ratios_private_ptr[iat * NumTeams + team_id] = sum;
179  }
180  }
181 
182  // do the reduction manually
183  for (int iat = 0; iat < nVP; ++iat)
184  {
185  ratios[iat] = ComplexT(0);
186  for (int tid = 0; tid < NumTeams; tid++)
187  ratios[iat] += ratios_private[iat][tid];
188  }
189 }
190 
191 template<typename ST>
194  const RefVector<ValueVector>& psi_list,
195  const std::vector<const ValueType*>& invRow_ptr_list,
196  std::vector<std::vector<ValueType>>& ratios_list) const
197 {
198  assert(this == &spo_list.getLeader());
199  auto& phi_leader = spo_list.getCastedLeader<SplineC2COMPTarget<ST>>();
200  auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
201  auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
202  auto& mw_ratios_private = mw_mem.mw_ratios_private;
203  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
204  auto& mw_results_scratch = mw_mem.mw_results_scratch;
205  const size_t nw = spo_list.size();
206  const size_t orb_size = phi_leader.size();
207 
208  size_t mw_nVP = 0;
209  for (const VirtualParticleSet& VP : vp_list)
210  mw_nVP += VP.getTotalNum();
211 
212  const size_t packed_size = nw * sizeof(ValueType*) + mw_nVP * (6 * sizeof(ST) + sizeof(int));
213  det_ratios_buffer_H2D.resize(packed_size);
214 
215  // pack invRow_ptr_list to det_ratios_buffer_H2D
216  Vector<const ValueType*> ptr_buffer(reinterpret_cast<const ValueType**>(det_ratios_buffer_H2D.data()), nw);
217  for (size_t iw = 0; iw < nw; iw++)
218  ptr_buffer[iw] = invRow_ptr_list[iw];
219 
220  // pack particle positions
221  auto* pos_ptr = reinterpret_cast<ST*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*));
222  auto* ref_id_ptr =
223  reinterpret_cast<int*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
224  size_t iVP = 0;
225  for (size_t iw = 0; iw < nw; iw++)
226  {
227  const VirtualParticleSet& VP = vp_list[iw];
228  assert(ratios_list[iw].size() == VP.getTotalNum());
229  for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP)
230  {
231  ref_id_ptr[iVP] = iw;
232  const PointType& r = VP.activeR(iat);
233  PointType ru(PrimLattice.toUnit_floor(r));
234  pos_ptr[0] = r[0];
235  pos_ptr[1] = r[1];
236  pos_ptr[2] = r[2];
237  pos_ptr[3] = ru[0];
238  pos_ptr[4] = ru[1];
239  pos_ptr[5] = ru[2];
240  pos_ptr += 6;
241  }
242  }
243 
244  const size_t ChunkSizePerTeam = 512;
245  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
246  mw_ratios_private.resize(mw_nVP, NumTeams);
247  const auto spline_padded_size = myV.size();
248  const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
249  mw_offload_scratch.resize(spline_padded_size * mw_nVP);
250  mw_results_scratch.resize(sposet_padded_size * mw_nVP);
251 
252  // Ye: need to extract sizes and pointers before entering target region
253  const auto* spline_ptr = SplineInst->getSplinePtr();
254  auto* offload_scratch_ptr = mw_offload_scratch.data();
255  auto* results_scratch_ptr = mw_results_scratch.data();
256  const auto myKcart_padded_size = myKcart->capacity();
257  auto* myKcart_ptr = myKcart->data();
258  auto* buffer_H2D_ptr = det_ratios_buffer_H2D.data();
259  auto* ratios_private_ptr = mw_ratios_private.data();
260  const size_t first_spo_local = first_spo;
261 
262  {
263  ScopedTimer offload(offload_timer_);
264  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \
265  map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \
266  map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
267  for (int iat = 0; iat < mw_nVP; iat++)
268  for (int team_id = 0; team_id < NumTeams; team_id++)
269  {
270  const size_t first = ChunkSizePerTeam * team_id;
271  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
272 
273  auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
274  auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
275  auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
276  auto* restrict psiinv_ptr = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
277  auto* restrict pos_scratch = reinterpret_cast<ST*>(buffer_H2D_ptr + nw * sizeof(ValueType*));
278 
279  int ix, iy, iz;
280  ST a[4], b[4], c[4];
281  spline2::computeLocationAndFractional(spline_ptr, pos_scratch[iat * 6 + 3], pos_scratch[iat * 6 + 4],
282  pos_scratch[iat * 6 + 5], ix, iy, iz, a, b, c);
283 
284  PRAGMA_OFFLOAD("omp parallel for")
285  for (int index = 0; index < last - first; index++)
286  spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
287  offload_scratch_iat_ptr + first + index);
288  const size_t first_cplx = first / 2;
289  const size_t last_cplx = omptarget::min(last / 2, orb_size);
290  PRAGMA_OFFLOAD("omp parallel for")
291  for (int index = first_cplx; index < last_cplx; index++)
292  C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr,
293  offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
294 
295  ComplexT sum(0);
296  PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
297  for (int i = first_cplx; i < last_cplx; i++)
298  sum += psi_iat_ptr[i] * psiinv_ptr[i];
299  ratios_private_ptr[iat * NumTeams + team_id] = sum;
300  }
301  }
302 
303  // do the reduction manually
304  iVP = 0;
305  for (size_t iw = 0; iw < nw; iw++)
306  {
307  auto& ratios = ratios_list[iw];
308  for (size_t iat = 0; iat < ratios.size(); iat++, iVP++)
309  {
310  ratios[iat] = ComplexT(0);
311  for (int tid = 0; tid < NumTeams; ++tid)
312  ratios[iat] += mw_ratios_private[iVP][tid];
313  }
314  }
315 }
316 
317 /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
318  */
319 template<typename ST>
321  ValueVector& psi,
322  GradVector& dpsi,
323  ValueVector& d2psi)
324 {
325  constexpr ST two(2);
326  const ST x = r[0], y = r[1], z = r[2];
327 
328  const ST* restrict k0 = myKcart->data(0);
329  const ST* restrict k1 = myKcart->data(1);
330  const ST* restrict k2 = myKcart->data(2);
331 
332  const ST* restrict g0 = myG.data(0);
333  const ST* restrict g1 = myG.data(1);
334  const ST* restrict g2 = myG.data(2);
335 
336  const size_t last_cplx = last_spo > psi.size() ? psi.size() : last_spo;
337  const size_t N = last_cplx - first_spo;
338 #pragma omp simd
339  for (size_t j = 0; j < N; ++j)
340  {
341  const size_t jr = j << 1;
342  const size_t ji = jr + 1;
343 
344  const ST kX = k0[j];
345  const ST kY = k1[j];
346  const ST kZ = k2[j];
347  const ST val_r = myV[jr];
348  const ST val_i = myV[ji];
349 
350  //phase
351  ST s, c;
352  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
353 
354  //dot(PrimLattice.G,myG[j])
355  const ST dX_r = g0[jr];
356  const ST dY_r = g1[jr];
357  const ST dZ_r = g2[jr];
358 
359  const ST dX_i = g0[ji];
360  const ST dY_i = g1[ji];
361  const ST dZ_i = g2[ji];
362 
363  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
364  const ST gX_r = dX_r + val_i * kX;
365  const ST gY_r = dY_r + val_i * kY;
366  const ST gZ_r = dZ_r + val_i * kZ;
367  const ST gX_i = dX_i - val_r * kX;
368  const ST gY_i = dY_i - val_r * kY;
369  const ST gZ_i = dZ_i - val_r * kZ;
370 
371  const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
372  const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
373 
374  const size_t psiIndex = j + first_spo;
375  psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
376  dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
377  dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
378  dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
379  d2psi[psiIndex] = ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
380  }
381 }
382 
383 template<typename ST>
385  const int iat,
386  ValueVector& psi,
387  GradVector& dpsi,
388  ValueVector& d2psi)
389 {
390  const PointType& r = P.activeR(iat);
391  PointType ru(PrimLattice.toUnit_floor(r));
392 
393  const size_t ChunkSizePerTeam = 512;
394  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
395 
396  const auto spline_padded_size = myV.size();
397  const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
398  offload_scratch.resize(spline_padded_size * SoAFields3D::NUM_FIELDS);
399  // for V(1)G(3)L(1) final result
400  results_scratch.resize(sposet_padded_size * 5);
401 
402  // Ye: need to extract sizes and pointers before entering target region
403  const auto* spline_ptr = SplineInst->getSplinePtr();
404  auto* offload_scratch_ptr = offload_scratch.data();
405  auto* results_scratch_ptr = results_scratch.data();
406  const auto x = r[0], y = r[1], z = r[2];
407  const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
408  const auto myKcart_padded_size = myKcart->capacity();
409  auto* mKK_ptr = mKK->data();
410  auto* GGt_ptr = GGt_offload->data();
411  auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
412  auto* myKcart_ptr = myKcart->data();
413  const size_t first_spo_local = first_spo;
414  const auto orb_size = psi.size();
415 
416  {
417  ScopedTimer offload(offload_timer_);
418  PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
419  map(always, from: results_scratch_ptr[0:sposet_padded_size*5])")
420  for (int team_id = 0; team_id < NumTeams; team_id++)
421  {
422  const size_t first = ChunkSizePerTeam * team_id;
423  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
424 
425  int ix, iy, iz;
426  ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
427  spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
428 
429  const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
430  PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
431  PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
432  const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
433  GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
434 
435  PRAGMA_OFFLOAD("omp parallel for")
436  for (int index = 0; index < last - first; index++)
437  {
438  spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
439  offload_scratch_ptr + first + index, spline_padded_size);
440  const int output_index = first + index;
441  offload_scratch_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
442  SymTrace(offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
443  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
444  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
445  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
446  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
447  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
448  }
449 
450  const size_t first_cplx = first / 2;
451  const size_t last_cplx = omptarget::min(last / 2, orb_size);
452  PRAGMA_OFFLOAD("omp parallel for")
453  for (int index = first_cplx; index < last_cplx; index++)
454  C2C::assign_vgl(x, y, z, results_scratch_ptr, sposet_padded_size, mKK_ptr, offload_scratch_ptr,
455  spline_padded_size, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
456  }
457  }
458 
459  for (size_t i = 0; i < orb_size; i++)
460  {
461  psi[i] = results_scratch[i];
462  dpsi[i][0] = results_scratch[i + sposet_padded_size];
463  dpsi[i][1] = results_scratch[i + sposet_padded_size * 2];
464  dpsi[i][2] = results_scratch[i + sposet_padded_size * 3];
465  d2psi[i] = results_scratch[i + sposet_padded_size * 4];
466  }
467 }
468 
469 template<typename ST>
471  Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
473  const RefVector<ValueVector>& psi_v_list,
474  const RefVector<GradVector>& dpsi_v_list,
475  const RefVector<ValueVector>& d2psi_v_list) const
476 {
477  const size_t num_pos = psi_v_list.size();
478  const size_t ChunkSizePerTeam = 512;
479  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
480  const auto spline_padded_size = myV.size();
481  const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
482  offload_scratch.resize(spline_padded_size * num_pos * SoAFields3D::NUM_FIELDS);
483  const auto orb_size = psi_v_list[0].get().size();
484  // for V(1)G(3)L(1) final result
485  results_scratch.resize(sposet_padded_size * num_pos * 5);
486 
487  // Ye: need to extract sizes and pointers before entering target region
488  const auto* spline_ptr = SplineInst->getSplinePtr();
489  auto* pos_copy_ptr = multi_pos.data();
490  auto* offload_scratch_ptr = offload_scratch.data();
491  auto* results_scratch_ptr = results_scratch.data();
492  const auto myKcart_padded_size = myKcart->capacity();
493  auto* mKK_ptr = mKK->data();
494  auto* GGt_ptr = GGt_offload->data();
495  auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
496  auto* myKcart_ptr = myKcart->data();
497  const size_t first_spo_local = first_spo;
498 
499  {
500  ScopedTimer offload(offload_timer_);
501  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
502  map(always, to: pos_copy_ptr[0:num_pos*6]) \
503  map(always, from: results_scratch_ptr[0:sposet_padded_size*num_pos*5])")
504  for (int iw = 0; iw < num_pos; iw++)
505  for (int team_id = 0; team_id < NumTeams; team_id++)
506  {
507  const size_t first = ChunkSizePerTeam * team_id;
508  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
509 
510  auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw * SoAFields3D::NUM_FIELDS;
511  auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
512 
513  int ix, iy, iz;
514  ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
515  spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
516  pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
517 
518  const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
519  PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
520  PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
521  const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
522  GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
523 
524  PRAGMA_OFFLOAD("omp parallel for")
525  for (int index = 0; index < last - first; index++)
526  {
527  spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
528  d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
529  const int output_index = first + index;
530  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
531  SymTrace(offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
532  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
533  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
534  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
535  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
536  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
537  }
538 
539  const size_t first_cplx = first / 2;
540  const size_t last_cplx = omptarget::min(last / 2, orb_size);
541  PRAGMA_OFFLOAD("omp parallel for")
542  for (int index = first_cplx; index < last_cplx; index++)
543  C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
544  sposet_padded_size, mKK_ptr, offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr,
545  myKcart_padded_size, first_spo_local, index);
546  }
547  }
548 
549  for (int iw = 0; iw < num_pos; ++iw)
550  {
551  auto* restrict results_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
552  ValueVector& psi_v(psi_v_list[iw]);
553  GradVector& dpsi_v(dpsi_v_list[iw]);
554  ValueVector& d2psi_v(d2psi_v_list[iw]);
555  for (size_t i = 0; i < orb_size; i++)
556  {
557  psi_v[i] = results_iw_ptr[i];
558  dpsi_v[i][0] = results_iw_ptr[i + sposet_padded_size];
559  dpsi_v[i][1] = results_iw_ptr[i + sposet_padded_size * 2];
560  dpsi_v[i][2] = results_iw_ptr[i + sposet_padded_size * 3];
561  d2psi_v[i] = results_iw_ptr[i + sposet_padded_size * 4];
562  }
563  }
564 }
565 
566 template<typename ST>
568  const RefVectorWithLeader<ParticleSet>& P_list,
569  int iat,
570  const RefVector<ValueVector>& psi_v_list,
571  const RefVector<GradVector>& dpsi_v_list,
572  const RefVector<ValueVector>& d2psi_v_list) const
573 {
574  assert(this == &sa_list.getLeader());
575  auto& phi_leader = sa_list.getCastedLeader<SplineC2COMPTarget<ST>>();
576  auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
577  auto& mw_pos_copy = mw_mem.mw_pos_copy;
578  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
579  auto& mw_results_scratch = mw_mem.mw_results_scratch;
580  const int nwalkers = sa_list.size();
581  mw_pos_copy.resize(nwalkers * 6);
582 
583  // pack particle positions
584  for (int iw = 0; iw < nwalkers; ++iw)
585  {
586  const PointType& r = P_list[iw].activeR(iat);
587  PointType ru(PrimLattice.toUnit_floor(r));
588  mw_pos_copy[iw * 6] = r[0];
589  mw_pos_copy[iw * 6 + 1] = r[1];
590  mw_pos_copy[iw * 6 + 2] = r[2];
591  mw_pos_copy[iw * 6 + 3] = ru[0];
592  mw_pos_copy[iw * 6 + 4] = ru[1];
593  mw_pos_copy[iw * 6 + 5] = ru[2];
594  }
595 
596  phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch, mw_results_scratch, psi_v_list, dpsi_v_list,
597  d2psi_v_list);
598 }
599 
600 template<typename ST>
602  const RefVectorWithLeader<ParticleSet>& P_list,
603  int iat,
604  const std::vector<const ValueType*>& invRow_ptr_list,
605  OffloadMWVGLArray& phi_vgl_v,
606  std::vector<ValueType>& ratios,
607  std::vector<GradType>& grads) const
608 {
609  assert(this == &spo_list.getLeader());
610  auto& phi_leader = spo_list.getCastedLeader<SplineC2COMPTarget<ST>>();
611  auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
612  auto& buffer_H2D = mw_mem.buffer_H2D;
613  auto& rg_private = mw_mem.rg_private;
614  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
615  auto& mw_results_scratch = mw_mem.mw_results_scratch;
616  const int nwalkers = spo_list.size();
617  buffer_H2D.resize(nwalkers, sizeof(ST) * 6 + sizeof(ValueType*));
618 
619  // pack particle positions and invRow pointers.
620  for (int iw = 0; iw < nwalkers; ++iw)
621  {
622  const PointType& r = P_list[iw].activeR(iat);
623  PointType ru(PrimLattice.toUnit_floor(r));
624  Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
625 
626  pos_copy[0] = r[0];
627  pos_copy[1] = r[1];
628  pos_copy[2] = r[2];
629  pos_copy[3] = ru[0];
630  pos_copy[4] = ru[1];
631  pos_copy[5] = ru[2];
632 
633  auto& invRow_ptr = *reinterpret_cast<const ValueType**>(buffer_H2D[iw] + sizeof(ST) * 6);
634  invRow_ptr = invRow_ptr_list[iw];
635  }
636 
637  const size_t num_pos = nwalkers;
638  const auto orb_size = phi_vgl_v.size(2);
639  const auto spline_padded_size = myV.size();
640  const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
641  const size_t ChunkSizePerTeam = 512;
642  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
643  mw_offload_scratch.resize(spline_padded_size * num_pos * SoAFields3D::NUM_FIELDS);
644  // for V(1)G(3)L(1) final result
645  mw_results_scratch.resize(sposet_padded_size * num_pos * 5);
646  // per team ratio and grads
647  rg_private.resize(num_pos, NumTeams * 4);
648 
649  // Ye: need to extract sizes and pointers before entering target region
650  const auto* spline_ptr = SplineInst->getSplinePtr();
651  auto* buffer_H2D_ptr = buffer_H2D.data();
652  auto* offload_scratch_ptr = mw_offload_scratch.data();
653  auto* results_scratch_ptr = mw_results_scratch.data();
654  const auto myKcart_padded_size = myKcart->capacity();
655  auto* mKK_ptr = mKK->data();
656  auto* GGt_ptr = GGt_offload->data();
657  auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
658  auto* myKcart_ptr = myKcart->data();
659  auto* phi_vgl_ptr = phi_vgl_v.data();
660  auto* rg_private_ptr = rg_private.data();
661  const size_t buffer_H2D_stride = buffer_H2D.cols();
662  const size_t first_spo_local = first_spo;
663  const size_t phi_vgl_stride = num_pos * orb_size;
664 
665  {
666  ScopedTimer offload(offload_timer_);
667  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
668  map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
669  map(always, from: rg_private_ptr[0:rg_private.size()])")
670  for (int iw = 0; iw < num_pos; iw++)
671  for (int team_id = 0; team_id < NumTeams; team_id++)
672  {
673  const size_t first = ChunkSizePerTeam * team_id;
674  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
675 
676  auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw * SoAFields3D::NUM_FIELDS;
677  auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
678  const auto* restrict pos_iw_ptr = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
679  const auto* restrict invRow_iw_ptr =
680  *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);
681 
682  int ix, iy, iz;
683  ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
684  spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
685  c, da, db, dc, d2a, d2b, d2c);
686 
687  const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
688  PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
689  PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
690  const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
691  GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
692 
693  PRAGMA_OFFLOAD("omp parallel for")
694  for (int index = 0; index < last - first; index++)
695  {
696  spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
697  d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
698  const int output_index = first + index;
699  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
700  SymTrace(offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
701  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
702  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
703  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
704  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
705  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
706  }
707 
708  const size_t first_cplx = first / 2;
709  const size_t last_cplx = omptarget::min(last / 2, orb_size);
710  PRAGMA_OFFLOAD("omp parallel for")
711  for (int index = first_cplx; index < last_cplx; index++)
712  C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, sposet_padded_size, mKK_ptr,
713  offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr, myKcart_padded_size,
714  first_spo_local, index);
715 
716  ValueType* restrict psi = psi_iw_ptr;
717  ValueType* restrict dpsi_x = psi_iw_ptr + sposet_padded_size;
718  ValueType* restrict dpsi_y = psi_iw_ptr + sposet_padded_size * 2;
719  ValueType* restrict dpsi_z = psi_iw_ptr + sposet_padded_size * 3;
720  ValueType* restrict d2psi = psi_iw_ptr + sposet_padded_size * 4;
721 
722  ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size;
723  ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
724  ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
725  ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
726  ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride;
727 
728  ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
729  PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
730  for (int j = first_cplx; j < last_cplx; j++)
731  {
732  const size_t psiIndex = first_spo_local + j;
733 
734  out_phi[psiIndex] = psi[psiIndex];
735  out_dphi_x[psiIndex] = dpsi_x[psiIndex];
736  out_dphi_y[psiIndex] = dpsi_y[psiIndex];
737  out_dphi_z[psiIndex] = dpsi_z[psiIndex];
738  out_d2phi[psiIndex] = d2psi[psiIndex];
739 
740  ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
741  grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex];
742  grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex];
743  grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex];
744  }
745 
746  rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio;
747  rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
748  rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
749  rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
750  }
751  }
752 
753  for (int iw = 0; iw < num_pos; iw++)
754  {
755  ValueType ratio(0);
756  for (int team_id = 0; team_id < NumTeams; team_id++)
757  ratio += rg_private[iw][team_id * 4];
758  ratios[iw] = ratio;
759 
760  ValueType grad_x(0), grad_y(0), grad_z(0);
761  for (int team_id = 0; team_id < NumTeams; team_id++)
762  {
763  grad_x += rg_private[iw][team_id * 4 + 1];
764  grad_y += rg_private[iw][team_id * 4 + 2];
765  grad_z += rg_private[iw][team_id * 4 + 3];
766  }
767  grads[iw] = GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
768  }
769 }
770 template<typename ST>
772  ValueVector& psi,
773  GradVector& dpsi,
774  HessVector& grad_grad_psi,
775  int first,
776  int last) const
777 {
778  // protect last
779  const size_t last_cplx = std::min(kPoints.size(), psi.size());
780  last = last > last_cplx ? last_cplx : last;
781 
782  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
783  g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
784  g22 = PrimLattice.G(8);
785  const ST x = r[0], y = r[1], z = r[2];
786 
787  const ST* restrict k0 = myKcart->data(0);
788  const ST* restrict k1 = myKcart->data(1);
789  const ST* restrict k2 = myKcart->data(2);
790 
791  const ST* restrict g0 = myG.data(0);
792  const ST* restrict g1 = myG.data(1);
793  const ST* restrict g2 = myG.data(2);
794  const ST* restrict h00 = myH.data(0);
795  const ST* restrict h01 = myH.data(1);
796  const ST* restrict h02 = myH.data(2);
797  const ST* restrict h11 = myH.data(3);
798  const ST* restrict h12 = myH.data(4);
799  const ST* restrict h22 = myH.data(5);
800 
801 #pragma omp simd
802  for (size_t j = first; j < last; ++j)
803  {
804  int jr = j << 1;
805  int ji = jr + 1;
806 
807  const ST kX = k0[j];
808  const ST kY = k1[j];
809  const ST kZ = k2[j];
810  const ST val_r = myV[jr];
811  const ST val_i = myV[ji];
812 
813  //phase
814  ST s, c;
815  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
816 
817  //dot(PrimLattice.G,myG[j])
818  const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
819  const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
820  const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
821 
822  const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
823  const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
824  const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
825 
826  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
827  const ST gX_r = dX_r + val_i * kX;
828  const ST gY_r = dY_r + val_i * kY;
829  const ST gZ_r = dZ_r + val_i * kZ;
830  const ST gX_i = dX_i - val_r * kX;
831  const ST gY_i = dY_i - val_r * kY;
832  const ST gZ_i = dZ_i - val_r * kZ;
833 
834  const size_t psiIndex = j + first_spo;
835  psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
836  dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
837  dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
838  dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
839 
840  const ST h_xx_r =
841  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
842  const ST h_xy_r =
843  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
844  const ST h_xz_r =
845  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
846  const ST h_yx_r =
847  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
848  const ST h_yy_r =
849  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
850  const ST h_yz_r =
851  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
852  const ST h_zx_r =
853  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
854  const ST h_zy_r =
855  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
856  const ST h_zz_r =
857  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
858 
859  const ST h_xx_i =
860  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
861  const ST h_xy_i =
862  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
863  const ST h_xz_i =
864  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
865  const ST h_yx_i =
866  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
867  const ST h_yy_i =
868  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
869  const ST h_yz_i =
870  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
871  const ST h_zx_i =
872  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
873  const ST h_zy_i =
874  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
875  const ST h_zz_i =
876  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
877 
878  grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
879  grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
880  grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
881  grad_grad_psi[psiIndex][3] = ComplexT(c * h_yx_r - s * h_yx_i, c * h_yx_i + s * h_yx_r);
882  grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
883  grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
884  grad_grad_psi[psiIndex][6] = ComplexT(c * h_zx_r - s * h_zx_i, c * h_zx_i + s * h_zx_r);
885  grad_grad_psi[psiIndex][7] = ComplexT(c * h_zy_r - s * h_zy_i, c * h_zy_i + s * h_zy_r);
886  grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
887  }
888 }
889 
890 template<typename ST>
892  const int iat,
893  ValueVector& psi,
894  GradVector& dpsi,
895  HessVector& grad_grad_psi)
896 {
897  const PointType& r = P.activeR(iat);
898  PointType ru(PrimLattice.toUnit_floor(r));
899 
900 #pragma omp parallel
901  {
902  int first, last;
903  // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
904  FairDivideAligned(2 * psi.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
905 
906  spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
907  assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
908  }
909 }
910 
911 template<typename ST>
913  ValueVector& psi,
914  GradVector& dpsi,
915  HessVector& grad_grad_psi,
916  GGGVector& grad_grad_grad_psi,
917  int first,
918  int last) const
919 {
920  // protect last
921  const size_t last_cplx = std::min(kPoints.size(), psi.size());
922  last = last < 0 ? last_cplx : (last > last_cplx ? last_cplx : last);
923 
924  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
925  g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
926  g22 = PrimLattice.G(8);
927  const ST x = r[0], y = r[1], z = r[2];
928 
929  const ST* restrict k0 = myKcart->data(0);
930  const ST* restrict k1 = myKcart->data(1);
931  const ST* restrict k2 = myKcart->data(2);
932 
933  const ST* restrict g0 = myG.data(0);
934  const ST* restrict g1 = myG.data(1);
935  const ST* restrict g2 = myG.data(2);
936  const ST* restrict h00 = myH.data(0);
937  const ST* restrict h01 = myH.data(1);
938  const ST* restrict h02 = myH.data(2);
939  const ST* restrict h11 = myH.data(3);
940  const ST* restrict h12 = myH.data(4);
941  const ST* restrict h22 = myH.data(5);
942 
943  const ST* restrict gh000 = mygH.data(0);
944  const ST* restrict gh001 = mygH.data(1);
945  const ST* restrict gh002 = mygH.data(2);
946  const ST* restrict gh011 = mygH.data(3);
947  const ST* restrict gh012 = mygH.data(4);
948  const ST* restrict gh022 = mygH.data(5);
949  const ST* restrict gh111 = mygH.data(6);
950  const ST* restrict gh112 = mygH.data(7);
951  const ST* restrict gh122 = mygH.data(8);
952  const ST* restrict gh222 = mygH.data(9);
953 
954 //SIMD doesn't work quite right yet. Comment out until further debugging.
955 #pragma omp simd
956  for (size_t j = first; j < last; ++j)
957  {
958  int jr = j << 1;
959  int ji = jr + 1;
960 
961  const ST kX = k0[j];
962  const ST kY = k1[j];
963  const ST kZ = k2[j];
964  const ST val_r = myV[jr];
965  const ST val_i = myV[ji];
966 
967  //phase
968  ST s, c;
969  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
970 
971  //dot(PrimLattice.G,myG[j])
972  const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
973  const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
974  const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
975 
976  const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
977  const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
978  const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
979 
980  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
981  const ST gX_r = dX_r + val_i * kX;
982  const ST gY_r = dY_r + val_i * kY;
983  const ST gZ_r = dZ_r + val_i * kZ;
984  const ST gX_i = dX_i - val_r * kX;
985  const ST gY_i = dY_i - val_r * kY;
986  const ST gZ_i = dZ_i - val_r * kZ;
987 
988  const size_t psiIndex = j + first_spo;
989  psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
990  dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
991  dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
992  dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
993 
994  //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
995  const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
996  const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
997  const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
998  const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
999  const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1000  const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1001 
1002  const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1003  const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1004  const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1005  const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1006  const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1007  const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1008 
1009  const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1010  const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1011  const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1012  const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1013  const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1014  const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1015 
1016  const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1017  const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1018  const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1019  const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1020  const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1021  const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1022 
1023  grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
1024  grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
1025  grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
1026  grad_grad_psi[psiIndex][3] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
1027  grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
1028  grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
1029  grad_grad_psi[psiIndex][6] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
1030  grad_grad_psi[psiIndex][7] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
1031  grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
1032 
1033  //These are the real and imaginary components of the third SPO derivative. _xxx denotes
1034  // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
1035 
1036  const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1037  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1038  const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1039  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1040  const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1041  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1042  const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1043  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1044  const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1045  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1046  const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1047  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1048  const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1049  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1050  const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1051  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1052  const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1053  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1054  const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1055  gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1056 
1057  const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1058  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1059  const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1060  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1061  const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1062  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1063  const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1064  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1065  const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1066  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1067  const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1068  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1069  const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1070  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1071  const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1072  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1073  const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1074  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1075  const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1076  gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1077 
1078  //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
1079  const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1080  const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1081  const ST gh_xxy_r =
1082  f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1083  const ST gh_xxy_i =
1084  f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1085  const ST gh_xxz_r =
1086  f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1087  const ST gh_xxz_i =
1088  f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1089  const ST gh_xyy_r =
1090  f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1091  const ST gh_xyy_i =
1092  f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1093  const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1094  (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1095  const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1096  (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1097  const ST gh_xzz_r =
1098  f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1099  const ST gh_xzz_i =
1100  f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1101  const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1102  const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1103  const ST gh_yyz_r =
1104  f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1105  const ST gh_yyz_i =
1106  f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1107  const ST gh_yzz_r =
1108  f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1109  const ST gh_yzz_i =
1110  f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1111  const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1112  const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1113 
1114  grad_grad_grad_psi[psiIndex][0][0] = ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
1115  grad_grad_grad_psi[psiIndex][0][1] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
1116  grad_grad_grad_psi[psiIndex][0][2] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
1117  grad_grad_grad_psi[psiIndex][0][3] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
1118  grad_grad_grad_psi[psiIndex][0][4] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
1119  grad_grad_grad_psi[psiIndex][0][5] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1120  grad_grad_grad_psi[psiIndex][0][6] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
1121  grad_grad_grad_psi[psiIndex][0][7] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1122  grad_grad_grad_psi[psiIndex][0][8] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
1123 
1124  grad_grad_grad_psi[psiIndex][1][0] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
1125  grad_grad_grad_psi[psiIndex][1][1] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
1126  grad_grad_grad_psi[psiIndex][1][2] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1127  grad_grad_grad_psi[psiIndex][1][3] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
1128  grad_grad_grad_psi[psiIndex][1][4] = ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
1129  grad_grad_grad_psi[psiIndex][1][5] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
1130  grad_grad_grad_psi[psiIndex][1][6] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1131  grad_grad_grad_psi[psiIndex][1][7] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
1132  grad_grad_grad_psi[psiIndex][1][8] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
1133 
1134 
1135  grad_grad_grad_psi[psiIndex][2][0] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
1136  grad_grad_grad_psi[psiIndex][2][1] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1137  grad_grad_grad_psi[psiIndex][2][2] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
1138  grad_grad_grad_psi[psiIndex][2][3] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1139  grad_grad_grad_psi[psiIndex][2][4] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
1140  grad_grad_grad_psi[psiIndex][2][5] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
1141  grad_grad_grad_psi[psiIndex][2][6] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
1142  grad_grad_grad_psi[psiIndex][2][7] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
1143  grad_grad_grad_psi[psiIndex][2][8] = ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
1144  }
1145 }
1146 
1147 template<typename ST>
1149  const int iat,
1150  ValueVector& psi,
1151  GradVector& dpsi,
1152  HessVector& grad_grad_psi,
1153  GGGVector& grad_grad_grad_psi)
1154 {
1155  const PointType& r = P.activeR(iat);
1156  PointType ru(PrimLattice.toUnit_floor(r));
1157 #pragma omp parallel
1158  {
1159  int first, last;
1160  FairDivideAligned(2 * psi.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
1161 
1162  spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
1163  assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
1164  }
1165 }
1166 
1167 template<typename ST>
1169  int first,
1170  int last,
1171  ValueMatrix& logdet,
1172  GradMatrix& dlogdet,
1173  ValueMatrix& d2logdet)
1174 {
1175  // chunk the [first, last) loop into blocks to save temporary memory usage
1176  const int block_size = 16;
1177 
1178  // reference vectors refer to the rows of matrices
1179  std::vector<ValueVector> multi_psi_v;
1180  std::vector<GradVector> multi_dpsi_v;
1181  std::vector<ValueVector> multi_d2psi_v;
1182  RefVector<ValueVector> psi_v_list;
1183  RefVector<GradVector> dpsi_v_list;
1184  RefVector<ValueVector> d2psi_v_list;
1185 
1186  multi_psi_v.reserve(block_size);
1187  multi_dpsi_v.reserve(block_size);
1188  multi_d2psi_v.reserve(block_size);
1189  psi_v_list.reserve(block_size);
1190  dpsi_v_list.reserve(block_size);
1191  d2psi_v_list.reserve(block_size);
1192 
1193  for (int iat = first, i = 0; iat < last; iat += block_size, i += block_size)
1194  {
1195  const int actual_block_size = std::min(last - iat, block_size);
1196  multi_pos_copy.resize(actual_block_size * 6);
1197  multi_psi_v.clear();
1198  multi_dpsi_v.clear();
1199  multi_d2psi_v.clear();
1200  psi_v_list.clear();
1201  dpsi_v_list.clear();
1202  d2psi_v_list.clear();
1203 
1204  for (int ipos = 0; ipos < actual_block_size; ++ipos)
1205  {
1206  // pack particle positions
1207  const PointType& r = P.activeR(iat + ipos);
1208  PointType ru(PrimLattice.toUnit_floor(r));
1209  multi_pos_copy[ipos * 6] = r[0];
1210  multi_pos_copy[ipos * 6 + 1] = r[1];
1211  multi_pos_copy[ipos * 6 + 2] = r[2];
1212  multi_pos_copy[ipos * 6 + 3] = ru[0];
1213  multi_pos_copy[ipos * 6 + 4] = ru[1];
1214  multi_pos_copy[ipos * 6 + 5] = ru[2];
1215 
1216  multi_psi_v.emplace_back(logdet[i + ipos], logdet.cols());
1217  multi_dpsi_v.emplace_back(dlogdet[i + ipos], dlogdet.cols());
1218  multi_d2psi_v.emplace_back(d2logdet[i + ipos], d2logdet.cols());
1219 
1220  psi_v_list.push_back(multi_psi_v[ipos]);
1221  dpsi_v_list.push_back(multi_dpsi_v[ipos]);
1222  d2psi_v_list.push_back(multi_d2psi_v[ipos]);
1223  }
1224 
1225  evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
1226  }
1227 }
1228 
1229 template class SplineC2COMPTarget<float>;
1230 template class SplineC2COMPTarget<double>;
1231 
1232 } // namespace qmcplusplus
OrbitalSetTraits< ValueType >::HessVector HessVector
Definition: SPOSet.h:53
T SymTrace(T h00, T h01, T h02, T h11, T h12, T h22, const T gg[6])
compute Trace(H*G)
Fixed-size array.
Definition: OhmmsTinyMeta.h:30
virtual void evaluateVGL(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi) override
evaluate the values, gradients and laplacians of this single-particle orbital set ...
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
virtual void evaluateValue(const ParticleSet &P, const int iat, ValueVector &psi) override
evaluate the values of this single-particle orbital set
QTBase::RealType RealType
Definition: Configuration.h:58
ResourceHandle< SplineOMPTargetMultiWalkerMem< ST, ComplexT > > mw_mem_handle_
Type_t * data()
Definition: OhmmsArray.h:87
A ParticleSet that handles virtual moves of a selected particle of a given physical ParticleSet Virtu...
OrbitalSetTraits< ValueType >::ValueMatrix ValueMatrix
Definition: SPOSet.h:50
T v_m_v(T h00, T h01, T h02, T h11, T h12, T h22, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z)
compute vector[3]^T x matrix[3][3] x vector[3]
T t3_contract(T h000, T h001, T h002, T h011, T h012, T h022, T h111, T h112, T h122, T h222, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z, T g3x, T g3y, T g3z)
Coordinate transform for a 3rd rank symmetric tensor representing coordinate derivatives (hence t3_co...
class to handle hdf file
Definition: hdf_archive.h:51
bool write_splines(hdf_archive &h5f)
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
virtual void evaluateVGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi) override
evaluate the values, gradients and hessians of this single-particle orbital set
class to match std::complex<ST> spline with BsplineSet::ValueType (complex) SPOs with OpenMP offload ...
T min(T a, T b)
virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const std::vector< const ValueType *> &invRow_ptr_list, OffloadMWVGLArray &phi_vgl_v, std::vector< ValueType > &ratios, std::vector< GradType > &grads) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ra...
void evaluateVGLMultiPos(const Vector< ST, OffloadPinnedAllocator< ST >> &multi_pos_copy, Vector< ST, OffloadPinnedAllocator< ST >> &offload_scratch, Vector< ComplexT, OffloadPinnedAllocator< ComplexT >> &results_scratch, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const
OrbitalSetTraits< ValueType >::GradMatrix GradMatrix
Definition: SPOSet.h:52
omp_int_t omp_get_thread_num()
Definition: OpenMP.h:25
Specialized paritlce class for atomistic simulations.
Definition: ParticleSet.h:55
SplineC2COMPTarget(const std::string &my_name)
QTBase::ValueType ValueType
Definition: Configuration.h:60
void FairDivideAligned(const int ntot, const int base, const int npart, const int me, int &first, int &last)
Partition ntot over npart and the size of each partition is a multiple of base size.
Definition: FairDivide.h:96
OrbitalSetTraits< ValueType >::ValueVector ValueVector
Definition: SPOSet.h:49
virtual void mw_evaluateDetRatios(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< const VirtualParticleSet > &vp_list, const RefVector< ValueVector > &psi_list, const std::vector< const ValueType *> &invRow_ptr_list, std::vector< std::vector< ValueType >> &ratios_list) const override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP, of multiple walkers ...
size_t size() const
Definition: OhmmsArray.h:57
void assign_vgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, int first, int last) const
void assign_vghgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi, int first=0, int last=-1) const
const PosType & activeR(int iat) const
return the active position if the particle is active or the return current position if not ...
Definition: ParticleSet.h:265
omp_int_t omp_get_num_threads()
Definition: OpenMP.h:27
OMPallocator is an allocator with fused device and dualspace allocator functionality.
typename BsplineSet::ValueType ComplexT
void assign_v(const PointType &r, const vContainer_type &myV, ValueVector &psi, int first, int last) const
virtual void evaluate_notranspose(const ParticleSet &P, int first, int last, ValueMatrix &logdet, GradMatrix &dlogdet, ValueMatrix &d2logdet) override
evaluate the values, gradients and laplacians of this single-particle orbital for [first...
std::vector< std::reference_wrapper< T > > RefVector
virtual void mw_evaluateVGL(const RefVectorWithLeader< SPOSet > &sa_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets of multiple walker...
bool read_splines(hdf_archive &h5f)
OrbitalSetTraits< ValueType >::GradHessVector GGGVector
Definition: SPOSet.h:55
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
OrbitalSetTraits< ValueType >::GradVector GradVector
Definition: SPOSet.h:51
void assign_vgl(ST x, ST y, ST z, TT *restrict results_scratch_ptr, size_t orb_padded_size, const ST *mKK_ptr, const ST *restrict offload_scratch_ptr, size_t spline_padded_size, const ST G[9], const ST *myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
assign_vgl
void assign_vgl_from_l(const PointType &r, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi)
assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian ...
virtual void evaluateDetRatios(const VirtualParticleSet &VP, ValueVector &psi, const ValueVector &psiinv, std::vector< ValueType > &ratios) override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP
class to handle complex splines to complex orbitals with splines of arbitrary precision splines stora...
void sincos(T a, T *restrict s, T *restrict c)
sincos function wrapper
Definition: math.hpp:62
virtual void evaluateVGHGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi) override
evaluate the values, gradients, hessians, and grad hessians of this single-particle orbital set ...
bool readEntry(T &data, const std::string &aname)
read the data from the group aname and return status use read() for inbuilt error checking ...
Definition: hdf_archive.h:293
A D-dimensional Array class based on PETE.
Definition: OhmmsArray.h:25
void set_spline(SingleSplineType *spline_r, SingleSplineType *spline_i, int twist, int ispline, int level)
bool writeEntry(T &data, const std::string &aname)
write the data to the group aname and return status use write() for inbuilt error checking ...
Definition: hdf_archive.h:244