QMCPACK
SplineC2ROMPTarget.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2019 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 
13 #include "SplineC2ROMPTarget.h"
14 #include "spline2/MultiBsplineEval.hpp"
15 #include "spline2/MultiBsplineEval_OMPoffload.hpp"
17 #include "ApplyPhaseC2R.hpp"
18 #include "Concurrency/OpenMP.h"
19 
20 namespace qmcplusplus
21 {
22 template<typename ST>
23 SplineC2ROMPTarget<ST>::SplineC2ROMPTarget(const SplineC2ROMPTarget& in) = default;
24 
25 template<typename ST>
27  SingleSplineType* spline_i,
28  int twist,
29  int ispline,
30  int level)
31 {
32  SplineInst->copy_spline(spline_r, 2 * ispline);
33  SplineInst->copy_spline(spline_i, 2 * ispline + 1);
34 }
35 
36 template<typename ST>
38 {
39  std::ostringstream o;
40  o << "spline_" << MyIndex;
41  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
42  return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
43 }
44 
45 template<typename ST>
47 {
48  std::ostringstream o;
49  o << "spline_" << MyIndex;
50  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
51  return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
52 }
53 
54 template<typename ST>
56  const vContainer_type& myV,
57  ValueVector& psi,
58  int first,
59  int last) const
60 {
61  // protect last
62  last = last > kPoints.size() ? kPoints.size() : last;
63 
64  const ST x = r[0], y = r[1], z = r[2];
65  const ST* restrict kx = myKcart->data(0);
66  const ST* restrict ky = myKcart->data(1);
67  const ST* restrict kz = myKcart->data(2);
68 
69  TT* restrict psi_s = psi.data() + first_spo;
70  const size_t requested_orb_size = psi.size();
71 #pragma omp simd
72  for (size_t j = first; j < std::min(nComplexBands, last); j++)
73  {
74  ST s, c;
75  const size_t jr = j << 1;
76  const size_t ji = jr + 1;
77  const ST val_r = myV[jr];
78  const ST val_i = myV[ji];
79  omptarget::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
80  if (jr < requested_orb_size)
81  psi_s[jr] = val_r * c - val_i * s;
82  if (ji < requested_orb_size)
83  psi_s[ji] = val_i * c + val_r * s;
84  }
85 
86  psi_s += nComplexBands;
87 #pragma omp simd
88  for (size_t j = std::max(nComplexBands, first); j < last; j++)
89  {
90  ST s, c;
91  const ST val_r = myV[2 * j];
92  const ST val_i = myV[2 * j + 1];
93  omptarget::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
94  if (j + nComplexBands < requested_orb_size)
95  psi_s[j] = val_r * c - val_i * s;
96  }
97 }
98 
99 template<typename ST>
101 {
102  const PointType& r = P.activeR(iat);
103  PointType ru(PrimLattice.toUnit_floor(r));
104 
105  if (true)
106  {
107 #pragma omp parallel
108  {
109  int first, last;
110  FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
111 
112  spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
113  assign_v(r, myV, psi, first / 2, last / 2);
114  }
115  }
116  else
117  {
118  const size_t ChunkSizePerTeam = 512;
119  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
120 
121  const auto spline_padded_size = myV.size();
122  const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
123  offload_scratch.resize(spline_padded_size);
124  results_scratch.resize(sposet_padded_size);
125 
126  // Ye: need to extract sizes and pointers before entering target region
127  const auto* spline_ptr = SplineInst->getSplinePtr();
128  auto* offload_scratch_ptr = offload_scratch.data();
129  auto* results_scratch_ptr = results_scratch.data();
130  auto* psi_ptr = psi.data();
131  const auto x = r[0], y = r[1], z = r[2];
132  const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
133  const auto myKcart_padded_size = myKcart->capacity();
134  auto* myKcart_ptr = myKcart->data();
135  const size_t first_spo_local = first_spo;
136  const size_t nComplexBands_local = nComplexBands;
137  const auto requested_orb_size = psi.size();
138  const auto num_complex_splines = kPoints.size();
139 
140  {
141  ScopedTimer offload(offload_timer_);
142  PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
143  map(always, from: results_scratch_ptr[0:sposet_padded_size])")
144  for (int team_id = 0; team_id < NumTeams; team_id++)
145  {
146  const size_t first = ChunkSizePerTeam * team_id;
147  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
148 
149  int ix, iy, iz;
150  ST a[4], b[4], c[4];
151  spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c);
152 
153  PRAGMA_OFFLOAD("omp parallel for")
154  for (int index = 0; index < last - first; index++)
155  spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
156  offload_scratch_ptr + first + index);
157  const size_t first_cplx = first / 2;
158  const size_t last_cplx = omptarget::min(last / 2, num_complex_splines);
159  PRAGMA_OFFLOAD("omp parallel for")
160  for (int index = first_cplx; index < last_cplx; index++)
161  C2R::assign_v(x, y, z, results_scratch_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size,
162  first_spo_local, nComplexBands_local, index);
163  }
164 
165  for (size_t i = 0; i < requested_orb_size; i++)
166  psi[i] = results_scratch[i];
167  }
168  }
169 }
170 
171 template<typename ST>
173  ValueVector& psi,
174  const ValueVector& psiinv,
175  std::vector<ValueType>& ratios)
176 {
177  const int nVP = VP.getTotalNum();
178  psiinv_pos_copy.resize(psiinv.size() + nVP * 6);
179 
180  // stage psiinv to psiinv_pos_copy
181  std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
182 
183  // pack particle positions
184  auto* restrict pos_scratch = psiinv_pos_copy.data() + psiinv.size();
185  for (int iat = 0; iat < nVP; ++iat)
186  {
187  const PointType& r = VP.activeR(iat);
188  PointType ru(PrimLattice.toUnit_floor(r));
189  pos_scratch[iat * 6] = r[0];
190  pos_scratch[iat * 6 + 1] = r[1];
191  pos_scratch[iat * 6 + 2] = r[2];
192  pos_scratch[iat * 6 + 3] = ru[0];
193  pos_scratch[iat * 6 + 4] = ru[1];
194  pos_scratch[iat * 6 + 5] = ru[2];
195  }
196 
197  const size_t ChunkSizePerTeam = 512;
198  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
199  ratios_private.resize(nVP, NumTeams);
200  const auto spline_padded_size = myV.size();
201  const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
202  offload_scratch.resize(spline_padded_size * nVP);
203  results_scratch.resize(sposet_padded_size * nVP);
204 
205  // Ye: need to extract sizes and pointers before entering target region
206  const auto* spline_ptr = SplineInst->getSplinePtr();
207  auto* offload_scratch_ptr = offload_scratch.data();
208  auto* results_scratch_ptr = results_scratch.data();
209  const auto myKcart_padded_size = myKcart->capacity();
210  auto* myKcart_ptr = myKcart->data();
211  auto* psiinv_ptr = psiinv_pos_copy.data();
212  auto* ratios_private_ptr = ratios_private.data();
213  const size_t first_spo_local = first_spo;
214  const size_t nComplexBands_local = nComplexBands;
215  const auto requested_orb_size = psiinv.size();
216  const auto num_complex_splines = kPoints.size();
217 
218  {
219  ScopedTimer offload(offload_timer_);
220  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \
221  map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \
222  map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
223  for (int iat = 0; iat < nVP; iat++)
224  for (int team_id = 0; team_id < NumTeams; team_id++)
225  {
226  const size_t first = ChunkSizePerTeam * team_id;
227  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
228 
229  auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
230  auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
231  auto* restrict pos_scratch = psiinv_ptr + requested_orb_size;
232 
233  int ix, iy, iz;
234  ST a[4], b[4], c[4];
235  spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
236  ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
237 
238  PRAGMA_OFFLOAD("omp parallel for")
239  for (int index = 0; index < last - first; index++)
240  spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
241  offload_scratch_iat_ptr + first + index);
242  const size_t first_cplx = first / 2;
243  const size_t last_cplx = omptarget::min(last / 2, num_complex_splines);
244  PRAGMA_OFFLOAD("omp parallel for")
245  for (int index = first_cplx; index < last_cplx; index++)
246  C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
247  psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
248  nComplexBands_local, index);
249 
250  const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
251  const size_t last_real =
252  omptarget::min(last_cplx + omptarget::min(nComplexBands_local, last_cplx), requested_orb_size);
253  TT sum(0);
254  PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
255  for (int i = first_real; i < last_real; i++)
256  sum += psi_iat_ptr[i] * psiinv_ptr[i];
257  ratios_private_ptr[iat * NumTeams + team_id] = sum;
258  }
259  }
260 
261  // do the reduction manually
262  for (int iat = 0; iat < nVP; ++iat)
263  {
264  ratios[iat] = TT(0);
265  for (int tid = 0; tid < NumTeams; tid++)
266  ratios[iat] += ratios_private[iat][tid];
267  }
268 }
269 
270 template<typename ST>
273  const RefVector<ValueVector>& psi_list,
274  const std::vector<const ValueType*>& invRow_ptr_list,
275  std::vector<std::vector<ValueType>>& ratios_list) const
276 {
277  assert(this == &spo_list.getLeader());
278  auto& phi_leader = spo_list.getCastedLeader<SplineC2ROMPTarget<ST>>();
279  auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
280  auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
281  auto& mw_ratios_private = mw_mem.mw_ratios_private;
282  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
283  auto& mw_results_scratch = mw_mem.mw_results_scratch;
284  const size_t nw = spo_list.size();
285  const size_t requested_orb_size = phi_leader.size();
286 
287  size_t mw_nVP = 0;
288  for (const VirtualParticleSet& VP : vp_list)
289  mw_nVP += VP.getTotalNum();
290 
291  const size_t packed_size = nw * sizeof(ValueType*) + mw_nVP * (6 * sizeof(TT) + sizeof(int));
292  det_ratios_buffer_H2D.resize(packed_size);
293 
294  // pack invRow_ptr_list to det_ratios_buffer_H2D
295  Vector<const ValueType*> ptr_buffer(reinterpret_cast<const ValueType**>(det_ratios_buffer_H2D.data()), nw);
296  for (size_t iw = 0; iw < nw; iw++)
297  ptr_buffer[iw] = invRow_ptr_list[iw];
298 
299  // pack particle positions
300  auto* pos_ptr = reinterpret_cast<TT*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*));
301  auto* ref_id_ptr =
302  reinterpret_cast<int*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(TT));
303  size_t iVP = 0;
304  for (size_t iw = 0; iw < nw; iw++)
305  {
306  const VirtualParticleSet& VP = vp_list[iw];
307  assert(ratios_list[iw].size() == VP.getTotalNum());
308  for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP)
309  {
310  ref_id_ptr[iVP] = iw;
311  const PointType& r = VP.activeR(iat);
312  PointType ru(PrimLattice.toUnit_floor(r));
313  pos_ptr[0] = r[0];
314  pos_ptr[1] = r[1];
315  pos_ptr[2] = r[2];
316  pos_ptr[3] = ru[0];
317  pos_ptr[4] = ru[1];
318  pos_ptr[5] = ru[2];
319  pos_ptr += 6;
320  }
321  }
322 
323  const size_t ChunkSizePerTeam = 512;
324  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
325  mw_ratios_private.resize(mw_nVP, NumTeams);
326  const auto spline_padded_size = myV.size();
327  const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
328  mw_offload_scratch.resize(spline_padded_size * mw_nVP);
329  mw_results_scratch.resize(sposet_padded_size * mw_nVP);
330 
331  // Ye: need to extract sizes and pointers before entering target region
332  const auto* spline_ptr = SplineInst->getSplinePtr();
333  auto* offload_scratch_ptr = mw_offload_scratch.data();
334  auto* results_scratch_ptr = mw_results_scratch.data();
335  const auto myKcart_padded_size = myKcart->capacity();
336  auto* myKcart_ptr = myKcart->data();
337  auto* buffer_H2D_ptr = det_ratios_buffer_H2D.data();
338  auto* ratios_private_ptr = mw_ratios_private.data();
339  const size_t first_spo_local = first_spo;
340  const size_t nComplexBands_local = nComplexBands;
341  const auto num_complex_splines = kPoints.size();
342 
343  {
344  ScopedTimer offload(offload_timer_);
345  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \
346  map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \
347  map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
348  for (int iat = 0; iat < mw_nVP; iat++)
349  for (int team_id = 0; team_id < NumTeams; team_id++)
350  {
351  const size_t first = ChunkSizePerTeam * team_id;
352  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
353 
354  auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
355  auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
356  auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(TT));
357  auto* restrict psiinv_ptr = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
358  auto* restrict pos_scratch = reinterpret_cast<TT*>(buffer_H2D_ptr + nw * sizeof(ValueType*));
359 
360  int ix, iy, iz;
361  ST a[4], b[4], c[4];
362  spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
363  ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
364 
365  PRAGMA_OFFLOAD("omp parallel for")
366  for (int index = 0; index < last - first; index++)
367  spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
368  offload_scratch_iat_ptr + first + index);
369  const size_t first_cplx = first / 2;
370  const size_t last_cplx = omptarget::min(last / 2, num_complex_splines);
371  PRAGMA_OFFLOAD("omp parallel for")
372  for (int index = first_cplx; index < last_cplx; index++)
373  C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
374  psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
375  nComplexBands_local, index);
376 
377  const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
378  const size_t last_real =
379  omptarget::min(last_cplx + omptarget::min(nComplexBands_local, last_cplx), requested_orb_size);
380  TT sum(0);
381  PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
382  for (int i = first_real; i < last_real; i++)
383  sum += psi_iat_ptr[i] * psiinv_ptr[i];
384  ratios_private_ptr[iat * NumTeams + team_id] = sum;
385  }
386  }
387 
388  // do the reduction manually
389  iVP = 0;
390  for (size_t iw = 0; iw < nw; iw++)
391  {
392  auto& ratios = ratios_list[iw];
393  for (size_t iat = 0; iat < ratios.size(); iat++, iVP++)
394  {
395  ratios[iat] = TT(0);
396  for (int tid = 0; tid < NumTeams; ++tid)
397  ratios[iat] += mw_ratios_private[iVP][tid];
398  }
399  }
400 }
401 
402 /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
403  */
404 template<typename ST>
406  ValueVector& psi,
407  GradVector& dpsi,
408  ValueVector& d2psi)
409 {
410  constexpr ST two(2);
411  const ST x = r[0], y = r[1], z = r[2];
412 
413  const ST* restrict k0 = myKcart->data(0);
414  ASSUME_ALIGNED(k0);
415  const ST* restrict k1 = myKcart->data(1);
416  ASSUME_ALIGNED(k1);
417  const ST* restrict k2 = myKcart->data(2);
418  ASSUME_ALIGNED(k2);
419 
420  const ST* restrict g0 = myG.data(0);
421  ASSUME_ALIGNED(g0);
422  const ST* restrict g1 = myG.data(1);
423  ASSUME_ALIGNED(g1);
424  const ST* restrict g2 = myG.data(2);
425  ASSUME_ALIGNED(g2);
426 
427  const size_t N = kPoints.size();
428 
429  const size_t requested_orb_size = psi.size();
430 #pragma omp simd
431  for (size_t j = 0; j < nComplexBands; j++)
432  {
433  const size_t jr = j << 1;
434  const size_t ji = jr + 1;
435 
436  const ST kX = k0[j];
437  const ST kY = k1[j];
438  const ST kZ = k2[j];
439  const ST val_r = myV[jr];
440  const ST val_i = myV[ji];
441 
442  //phase
443  ST s, c;
444  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
445 
446  //dot(PrimLattice.G,myG[j])
447  const ST dX_r = g0[jr];
448  const ST dY_r = g1[jr];
449  const ST dZ_r = g2[jr];
450 
451  const ST dX_i = g0[ji];
452  const ST dY_i = g1[ji];
453  const ST dZ_i = g2[ji];
454 
455  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
456  const ST gX_r = dX_r + val_i * kX;
457  const ST gY_r = dY_r + val_i * kY;
458  const ST gZ_r = dZ_r + val_i * kZ;
459  const ST gX_i = dX_i - val_r * kX;
460  const ST gY_i = dY_i - val_r * kY;
461  const ST gZ_i = dZ_i - val_r * kZ;
462 
463  const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
464  const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
465 
466  const size_t psiIndex = first_spo + jr;
467  if (psiIndex < requested_orb_size)
468  {
469  psi[psiIndex] = c * val_r - s * val_i;
470  d2psi[psiIndex] = c * lap_r - s * lap_i;
471  dpsi[psiIndex][0] = c * gX_r - s * gX_i;
472  dpsi[psiIndex][1] = c * gY_r - s * gY_i;
473  dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
474  }
475  if (psiIndex + 1 < requested_orb_size)
476  {
477  psi[psiIndex + 1] = c * val_i + s * val_r;
478  d2psi[psiIndex + 1] = c * lap_i + s * lap_r;
479  dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
480  dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
481  dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
482  }
483  }
484 
485 #pragma omp simd
486  for (size_t j = nComplexBands; j < N; j++)
487  {
488  const size_t jr = j << 1;
489  const size_t ji = jr + 1;
490 
491  const ST kX = k0[j];
492  const ST kY = k1[j];
493  const ST kZ = k2[j];
494  const ST val_r = myV[jr];
495  const ST val_i = myV[ji];
496 
497  //phase
498  ST s, c;
499  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
500 
501  //dot(PrimLattice.G,myG[j])
502  const ST dX_r = g0[jr];
503  const ST dY_r = g1[jr];
504  const ST dZ_r = g2[jr];
505 
506  const ST dX_i = g0[ji];
507  const ST dY_i = g1[ji];
508  const ST dZ_i = g2[ji];
509 
510  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
511  const ST gX_r = dX_r + val_i * kX;
512  const ST gY_r = dY_r + val_i * kY;
513  const ST gZ_r = dZ_r + val_i * kZ;
514  const ST gX_i = dX_i - val_r * kX;
515  const ST gY_i = dY_i - val_r * kY;
516  const ST gZ_i = dZ_i - val_r * kZ;
517  if (const size_t psiIndex = first_spo + nComplexBands + j; psiIndex < requested_orb_size)
518  {
519  psi[psiIndex] = c * val_r - s * val_i;
520  dpsi[psiIndex][0] = c * gX_r - s * gX_i;
521  dpsi[psiIndex][1] = c * gY_r - s * gY_i;
522  dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
523 
524  const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
525  const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
526  d2psi[psiIndex] = c * lap_r - s * lap_i;
527  }
528  }
529 }
530 
531 template<typename ST>
533  const int iat,
534  ValueVector& psi,
535  GradVector& dpsi,
536  ValueVector& d2psi)
537 {
538  const PointType& r = P.activeR(iat);
539  PointType ru(PrimLattice.toUnit_floor(r));
540 
541  const size_t ChunkSizePerTeam = 512;
542  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
543 
544  const auto spline_padded_size = myV.size();
545  const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
546  // for V(1)G(3)H(6) intermediate result
547  offload_scratch.resize(spline_padded_size * SoAFields3D::NUM_FIELDS);
548  // for V(1)G(3)L(1) final result
549  results_scratch.resize(sposet_padded_size * 5);
550 
551  // Ye: need to extract sizes and pointers before entering target region
552  const auto* spline_ptr = SplineInst->getSplinePtr();
553  auto* offload_scratch_ptr = offload_scratch.data();
554  auto* results_scratch_ptr = results_scratch.data();
555  const auto x = r[0], y = r[1], z = r[2];
556  const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
557  const auto myKcart_padded_size = myKcart->capacity();
558  auto* mKK_ptr = mKK->data();
559  auto* GGt_ptr = GGt_offload->data();
560  auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
561  auto* myKcart_ptr = myKcart->data();
562  const size_t first_spo_local = first_spo;
563  const size_t nComplexBands_local = nComplexBands;
564  const auto requested_orb_size = psi.size();
565  const auto num_complex_splines = kPoints.size();
566 
567  {
568  ScopedTimer offload(offload_timer_);
569  PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
570  map(always, from: results_scratch_ptr[0:sposet_padded_size*5])")
571  for (int team_id = 0; team_id < NumTeams; team_id++)
572  {
573  const size_t first = ChunkSizePerTeam * team_id;
574  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
575 
576  int ix, iy, iz;
577  ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
578  spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
579 
580  const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
581  PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
582  PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
583  const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
584  GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
585 
586  PRAGMA_OFFLOAD("omp parallel for")
587  for (int index = 0; index < last - first; index++)
588  {
589  spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
590  offload_scratch_ptr + first + index, spline_padded_size);
591  const int output_index = first + index;
592  offload_scratch_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
593  SymTrace(offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
594  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
595  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
596  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
597  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
598  offload_scratch_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
599  }
600  const size_t first_cplx = first / 2;
601  const size_t last_cplx = omptarget::min(last / 2, num_complex_splines);
602  PRAGMA_OFFLOAD("omp parallel for")
603  for (int index = first_cplx; index < last_cplx; index++)
604  C2R::assign_vgl(x, y, z, results_scratch_ptr, sposet_padded_size, mKK_ptr, offload_scratch_ptr,
605  spline_padded_size, G, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local,
606  index);
607  }
608  }
609 
610  for (size_t i = 0; i < requested_orb_size; i++)
611  {
612  psi[i] = results_scratch[i];
613  dpsi[i][0] = results_scratch[i + sposet_padded_size * 1];
614  dpsi[i][1] = results_scratch[i + sposet_padded_size * 2];
615  dpsi[i][2] = results_scratch[i + sposet_padded_size * 3];
616  d2psi[i] = results_scratch[i + sposet_padded_size * 4];
617  }
618 }
619 
620 template<typename ST>
622  Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
623  Vector<TT, OffloadPinnedAllocator<TT>>& results_scratch,
624  const RefVector<ValueVector>& psi_v_list,
625  const RefVector<GradVector>& dpsi_v_list,
626  const RefVector<ValueVector>& d2psi_v_list) const
627 {
628  const size_t num_pos = psi_v_list.size();
629  const size_t ChunkSizePerTeam = 512;
630  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
631  const auto spline_padded_size = myV.size();
632  const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
633  // for V(1)G(3)H(6) intermediate result
634  offload_scratch.resize(spline_padded_size * num_pos * SoAFields3D::NUM_FIELDS);
635  // for V(1)G(3)L(1) final result
636  results_scratch.resize(sposet_padded_size * num_pos * 5);
637 
638  // Ye: need to extract sizes and pointers before entering target region
639  const auto* spline_ptr = SplineInst->getSplinePtr();
640  auto* pos_copy_ptr = multi_pos.data();
641  auto* offload_scratch_ptr = offload_scratch.data();
642  auto* results_scratch_ptr = results_scratch.data();
643  const auto myKcart_padded_size = myKcart->capacity();
644  auto* mKK_ptr = mKK->data();
645  auto* GGt_ptr = GGt_offload->data();
646  auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
647  auto* myKcart_ptr = myKcart->data();
648  const size_t first_spo_local = first_spo;
649  const size_t nComplexBands_local = nComplexBands;
650  const auto requested_orb_size = psi_v_list[0].get().size();
651  const auto num_complex_splines = kPoints.size();
652 
653  {
654  ScopedTimer offload(offload_timer_);
655  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
656  map(always, to: pos_copy_ptr[0:num_pos*6]) \
657  map(always, from: results_scratch_ptr[0:sposet_padded_size*num_pos*5])")
658  for (int iw = 0; iw < num_pos; iw++)
659  for (int team_id = 0; team_id < NumTeams; team_id++)
660  {
661  const size_t first = ChunkSizePerTeam * team_id;
662  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
663 
664  auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw * SoAFields3D::NUM_FIELDS;
665  auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
666 
667  int ix, iy, iz;
668  ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
669  spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
670  pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
671 
672  const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
673  PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
674  PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
675  const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
676  GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
677 
678  PRAGMA_OFFLOAD("omp parallel for")
679  for (int index = 0; index < last - first; index++)
680  {
681  spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
682  d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
683  const int output_index = first + index;
684  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
685  SymTrace(offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
686  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
687  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
688  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
689  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
690  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
691  }
692  const size_t first_cplx = first / 2;
693  const size_t last_cplx = omptarget::min(last / 2, num_complex_splines);
694  PRAGMA_OFFLOAD("omp parallel for")
695  for (int index = first_cplx; index < last_cplx; index++)
696  C2R::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
697  sposet_padded_size, mKK_ptr, offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr,
698  myKcart_padded_size, first_spo_local, nComplexBands_local, index);
699  }
700  }
701 
702  for (int iw = 0; iw < num_pos; ++iw)
703  {
704  auto* restrict results_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
705  ValueVector& psi_v(psi_v_list[iw]);
706  GradVector& dpsi_v(dpsi_v_list[iw]);
707  ValueVector& d2psi_v(d2psi_v_list[iw]);
708  for (size_t i = 0; i < requested_orb_size; i++)
709  {
710  psi_v[i] = results_iw_ptr[i];
711  dpsi_v[i][0] = results_iw_ptr[i + sposet_padded_size];
712  dpsi_v[i][1] = results_iw_ptr[i + sposet_padded_size * 2];
713  dpsi_v[i][2] = results_iw_ptr[i + sposet_padded_size * 3];
714  d2psi_v[i] = results_iw_ptr[i + sposet_padded_size * 4];
715  }
716  }
717 }
718 
719 template<typename ST>
721  const RefVectorWithLeader<ParticleSet>& P_list,
722  int iat,
723  const RefVector<ValueVector>& psi_v_list,
724  const RefVector<GradVector>& dpsi_v_list,
725  const RefVector<ValueVector>& d2psi_v_list) const
726 {
727  assert(this == &sa_list.getLeader());
728  auto& phi_leader = sa_list.getCastedLeader<SplineC2ROMPTarget<ST>>();
729  auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
730  auto& mw_pos_copy = mw_mem.mw_pos_copy;
731  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
732  auto& mw_results_scratch = mw_mem.mw_results_scratch;
733  const int nwalkers = sa_list.size();
734  mw_pos_copy.resize(nwalkers * 6);
735 
736  // pack particle positions
737  for (int iw = 0; iw < nwalkers; ++iw)
738  {
739  const PointType& r = P_list[iw].activeR(iat);
740  PointType ru(PrimLattice.toUnit_floor(r));
741  mw_pos_copy[iw * 6] = r[0];
742  mw_pos_copy[iw * 6 + 1] = r[1];
743  mw_pos_copy[iw * 6 + 2] = r[2];
744  mw_pos_copy[iw * 6 + 3] = ru[0];
745  mw_pos_copy[iw * 6 + 4] = ru[1];
746  mw_pos_copy[iw * 6 + 5] = ru[2];
747  }
748 
749  phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch, mw_results_scratch, psi_v_list, dpsi_v_list,
750  d2psi_v_list);
751 }
752 
753 template<typename ST>
755  const RefVectorWithLeader<ParticleSet>& P_list,
756  int iat,
757  const std::vector<const ValueType*>& invRow_ptr_list,
758  OffloadMWVGLArray& phi_vgl_v,
759  std::vector<ValueType>& ratios,
760  std::vector<GradType>& grads) const
761 {
762  assert(this == &spo_list.getLeader());
763  auto& phi_leader = spo_list.getCastedLeader<SplineC2ROMPTarget<ST>>();
764  auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
765  auto& buffer_H2D = mw_mem.buffer_H2D;
766  auto& rg_private = mw_mem.rg_private;
767  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
768  auto& mw_results_scratch = mw_mem.mw_results_scratch;
769  const int nwalkers = spo_list.size();
770  buffer_H2D.resize(nwalkers, sizeof(ST) * 6 + sizeof(ValueType*));
771 
772  // pack particle positions and invRow pointers.
773  for (int iw = 0; iw < nwalkers; ++iw)
774  {
775  const PointType& r = P_list[iw].activeR(iat);
776  PointType ru(PrimLattice.toUnit_floor(r));
777  Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
778 
779  pos_copy[0] = r[0];
780  pos_copy[1] = r[1];
781  pos_copy[2] = r[2];
782  pos_copy[3] = ru[0];
783  pos_copy[4] = ru[1];
784  pos_copy[5] = ru[2];
785 
786  auto& invRow_ptr = *reinterpret_cast<const ValueType**>(buffer_H2D[iw] + sizeof(ST) * 6);
787  invRow_ptr = invRow_ptr_list[iw];
788  }
789 
790  const size_t num_pos = nwalkers;
791  const auto spline_padded_size = myV.size();
792  const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
793  const size_t ChunkSizePerTeam = 512;
794  const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
795 
796  mw_offload_scratch.resize(spline_padded_size * num_pos * SoAFields3D::NUM_FIELDS);
797  // for V(1)G(3)L(1) final result
798  mw_results_scratch.resize(sposet_padded_size * num_pos * 5);
799  // per team ratio and grads
800  rg_private.resize(num_pos, NumTeams * 4);
801 
802  // Ye: need to extract sizes and pointers before entering target region
803  const auto* spline_ptr = SplineInst->getSplinePtr();
804  auto* buffer_H2D_ptr = buffer_H2D.data();
805  auto* offload_scratch_ptr = mw_offload_scratch.data();
806  auto* results_scratch_ptr = mw_results_scratch.data();
807  const auto myKcart_padded_size = myKcart->capacity();
808  auto* mKK_ptr = mKK->data();
809  auto* GGt_ptr = GGt_offload->data();
810  auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
811  auto* myKcart_ptr = myKcart->data();
812  auto* phi_vgl_ptr = phi_vgl_v.data();
813  auto* rg_private_ptr = rg_private.data();
814  const size_t buffer_H2D_stride = buffer_H2D.cols();
815  const size_t first_spo_local = first_spo;
816  const auto requested_orb_size = phi_vgl_v.size(2);
817  const size_t phi_vgl_stride = num_pos * requested_orb_size;
818  const size_t nComplexBands_local = nComplexBands;
819  const auto num_complex_splines = kPoints.size();
820 
821  {
822  ScopedTimer offload(offload_timer_);
823  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
824  map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
825  map(always, from: rg_private_ptr[0:rg_private.size()])")
826  for (int iw = 0; iw < num_pos; iw++)
827  for (int team_id = 0; team_id < NumTeams; team_id++)
828  {
829  const size_t first = ChunkSizePerTeam * team_id;
830  const size_t last = omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
831 
832  auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw * SoAFields3D::NUM_FIELDS;
833  auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
834  const auto* restrict pos_iw_ptr = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
835  const auto* restrict invRow_iw_ptr =
836  *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);
837 
838  int ix, iy, iz;
839  ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
840  spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
841  c, da, db, dc, d2a, d2b, d2c);
842 
843  const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
844  PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
845  PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
846  const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
847  GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
848 
849  PRAGMA_OFFLOAD("omp parallel for")
850  for (int index = 0; index < last - first; index++)
851  {
852  spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
853  d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
854  const int output_index = first + index;
855  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::LAPL + output_index] =
856  SymTrace(offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS00 + output_index],
857  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS01 + output_index],
858  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS02 + output_index],
859  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS11 + output_index],
860  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS12 + output_index],
861  offload_scratch_iw_ptr[spline_padded_size * SoAFields3D::HESS22 + output_index], symGGt);
862  }
863  const size_t first_cplx = first / 2;
864  const size_t last_cplx = omptarget::min(last / 2, num_complex_splines);
865  PRAGMA_OFFLOAD("omp parallel for")
866  for (int index = first_cplx; index < last_cplx; index++)
867  C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, sposet_padded_size, mKK_ptr,
868  offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr, myKcart_padded_size,
869  first_spo_local, nComplexBands_local, index);
870 
871  ValueType* restrict psi = psi_iw_ptr;
872  ValueType* restrict dpsi_x = psi_iw_ptr + sposet_padded_size;
873  ValueType* restrict dpsi_y = psi_iw_ptr + sposet_padded_size * 2;
874  ValueType* restrict dpsi_z = psi_iw_ptr + sposet_padded_size * 3;
875  ValueType* restrict d2psi = psi_iw_ptr + sposet_padded_size * 4;
876 
877  ValueType* restrict out_phi = phi_vgl_ptr + iw * requested_orb_size;
878  ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
879  ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
880  ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
881  ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride;
882 
883  const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
884  const size_t last_real =
885  omptarget::min(last_cplx + omptarget::min(nComplexBands_local, last_cplx), requested_orb_size);
886  ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
887  PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
888  for (int j = first_real; j < last_real; j++)
889  {
890  out_phi[j] = psi[j];
891  out_dphi_x[j] = dpsi_x[j];
892  out_dphi_y[j] = dpsi_y[j];
893  out_dphi_z[j] = dpsi_z[j];
894  out_d2phi[j] = d2psi[j];
895 
896  ratio += psi[j] * invRow_iw_ptr[j];
897  grad_x += dpsi_x[j] * invRow_iw_ptr[j];
898  grad_y += dpsi_y[j] * invRow_iw_ptr[j];
899  grad_z += dpsi_z[j] * invRow_iw_ptr[j];
900  }
901 
902  rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio;
903  rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
904  rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
905  rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
906  }
907  }
908 
909  for (int iw = 0; iw < num_pos; iw++)
910  {
911  ValueType ratio(0);
912  for (int team_id = 0; team_id < NumTeams; team_id++)
913  ratio += rg_private[iw][team_id * 4];
914  ratios[iw] = ratio;
915 
916  ValueType grad_x(0), grad_y(0), grad_z(0);
917  for (int team_id = 0; team_id < NumTeams; team_id++)
918  {
919  grad_x += rg_private[iw][team_id * 4 + 1];
920  grad_y += rg_private[iw][team_id * 4 + 2];
921  grad_z += rg_private[iw][team_id * 4 + 3];
922  }
923  grads[iw] = GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
924  }
925 }
926 
927 template<typename ST>
929  ValueVector& psi,
930  GradVector& dpsi,
931  HessVector& grad_grad_psi,
932  int first,
933  int last) const
934 {
935  // protect last
936  last = last > kPoints.size() ? kPoints.size() : last;
937 
938  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
939  g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
940  g22 = PrimLattice.G(8);
941  const ST x = r[0], y = r[1], z = r[2];
942 
943  const ST* restrict k0 = myKcart->data(0);
944  const ST* restrict k1 = myKcart->data(1);
945  const ST* restrict k2 = myKcart->data(2);
946 
947  const ST* restrict g0 = myG.data(0);
948  const ST* restrict g1 = myG.data(1);
949  const ST* restrict g2 = myG.data(2);
950  const ST* restrict h00 = myH.data(0);
951  const ST* restrict h01 = myH.data(1);
952  const ST* restrict h02 = myH.data(2);
953  const ST* restrict h11 = myH.data(3);
954  const ST* restrict h12 = myH.data(4);
955  const ST* restrict h22 = myH.data(5);
956 
957  const size_t requested_orb_size = psi.size();
958 #pragma omp simd
959  for (size_t j = first; j < std::min(nComplexBands, last); j++)
960  {
961  int jr = j << 1;
962  int ji = jr + 1;
963 
964  const ST kX = k0[j];
965  const ST kY = k1[j];
966  const ST kZ = k2[j];
967  const ST val_r = myV[jr];
968  const ST val_i = myV[ji];
969 
970  //phase
971  ST s, c;
972  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
973 
974  //dot(PrimLattice.G,myG[j])
975  const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
976  const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
977  const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
978 
979  const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
980  const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
981  const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
982 
983  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
984  const ST gX_r = dX_r + val_i * kX;
985  const ST gY_r = dY_r + val_i * kY;
986  const ST gZ_r = dZ_r + val_i * kZ;
987  const ST gX_i = dX_i - val_r * kX;
988  const ST gY_i = dY_i - val_r * kY;
989  const ST gZ_i = dZ_i - val_r * kZ;
990 
991  const size_t psiIndex = first_spo + jr;
992 
993  if (psiIndex < requested_orb_size)
994  {
995  psi[psiIndex] = c * val_r - s * val_i;
996  dpsi[psiIndex][0] = c * gX_r - s * gX_i;
997  dpsi[psiIndex][1] = c * gY_r - s * gY_i;
998  dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
999  }
1000 
1001  if (psiIndex + 1 < requested_orb_size)
1002  {
1003  psi[psiIndex + 1] = c * val_i + s * val_r;
1004  dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
1005  dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
1006  dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
1007  }
1008 
1009  const ST h_xx_r =
1010  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
1011  const ST h_xy_r =
1012  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
1013  const ST h_xz_r =
1014  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
1015  const ST h_yx_r =
1016  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
1017  const ST h_yy_r =
1018  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
1019  const ST h_yz_r =
1020  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
1021  const ST h_zx_r =
1022  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
1023  const ST h_zy_r =
1024  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
1025  const ST h_zz_r =
1026  v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
1027 
1028  const ST h_xx_i =
1029  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
1030  const ST h_xy_i =
1031  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
1032  const ST h_xz_i =
1033  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
1034  const ST h_yx_i =
1035  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
1036  const ST h_yy_i =
1037  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
1038  const ST h_yz_i =
1039  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
1040  const ST h_zx_i =
1041  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
1042  const ST h_zy_i =
1043  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
1044  const ST h_zz_i =
1045  v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
1046 
1047  if (psiIndex < requested_orb_size)
1048  {
1049  grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
1050  grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
1051  grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
1052  grad_grad_psi[psiIndex][3] = c * h_yx_r - s * h_yx_i;
1053  grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
1054  grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
1055  grad_grad_psi[psiIndex][6] = c * h_zx_r - s * h_zx_i;
1056  grad_grad_psi[psiIndex][7] = c * h_zy_r - s * h_zy_i;
1057  grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
1058  }
1059 
1060  if (psiIndex + 1 < requested_orb_size)
1061  {
1062  grad_grad_psi[psiIndex + 1][0] = c * h_xx_i + s * h_xx_r;
1063  grad_grad_psi[psiIndex + 1][1] = c * h_xy_i + s * h_xy_r;
1064  grad_grad_psi[psiIndex + 1][2] = c * h_xz_i + s * h_xz_r;
1065  grad_grad_psi[psiIndex + 1][3] = c * h_yx_i + s * h_yx_r;
1066  grad_grad_psi[psiIndex + 1][4] = c * h_yy_i + s * h_yy_r;
1067  grad_grad_psi[psiIndex + 1][5] = c * h_yz_i + s * h_yz_r;
1068  grad_grad_psi[psiIndex + 1][6] = c * h_zx_i + s * h_zx_r;
1069  grad_grad_psi[psiIndex + 1][7] = c * h_zy_i + s * h_zy_r;
1070  grad_grad_psi[psiIndex + 1][8] = c * h_zz_i + s * h_zz_r;
1071  }
1072  }
1073 
1074 #pragma omp simd
1075  for (size_t j = std::max(nComplexBands, first); j < last; j++)
1076  {
1077  int jr = j << 1;
1078  int ji = jr + 1;
1079 
1080  const ST kX = k0[j];
1081  const ST kY = k1[j];
1082  const ST kZ = k2[j];
1083  const ST val_r = myV[jr];
1084  const ST val_i = myV[ji];
1085 
1086  //phase
1087  ST s, c;
1088  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
1089 
1090  //dot(PrimLattice.G,myG[j])
1091  const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1092  const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1093  const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1094 
1095  const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1096  const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1097  const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1098 
1099  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
1100  const ST gX_r = dX_r + val_i * kX;
1101  const ST gY_r = dY_r + val_i * kY;
1102  const ST gZ_r = dZ_r + val_i * kZ;
1103  const ST gX_i = dX_i - val_r * kX;
1104  const ST gY_i = dY_i - val_r * kY;
1105  const ST gZ_i = dZ_i - val_r * kZ;
1106 
1107  if (const size_t psiIndex = first_spo + nComplexBands + j; psiIndex < requested_orb_size)
1108  {
1109  psi[psiIndex] = c * val_r - s * val_i;
1110  dpsi[psiIndex][0] = c * gX_r - s * gX_i;
1111  dpsi[psiIndex][1] = c * gY_r - s * gY_i;
1112  dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
1113 
1114  const ST h_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) +
1115  kX * (gX_i + dX_i);
1116  const ST h_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) +
1117  kX * (gY_i + dY_i);
1118  const ST h_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) +
1119  kX * (gZ_i + dZ_i);
1120  const ST h_yx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) +
1121  kY * (gX_i + dX_i);
1122  const ST h_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) +
1123  kY * (gY_i + dY_i);
1124  const ST h_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) +
1125  kY * (gZ_i + dZ_i);
1126  const ST h_zx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) +
1127  kZ * (gX_i + dX_i);
1128  const ST h_zy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) +
1129  kZ * (gY_i + dY_i);
1130  const ST h_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) +
1131  kZ * (gZ_i + dZ_i);
1132 
1133  const ST h_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) -
1134  kX * (gX_r + dX_r);
1135  const ST h_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) -
1136  kX * (gY_r + dY_r);
1137  const ST h_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) -
1138  kX * (gZ_r + dZ_r);
1139  const ST h_yx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) -
1140  kY * (gX_r + dX_r);
1141  const ST h_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) -
1142  kY * (gY_r + dY_r);
1143  const ST h_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) -
1144  kY * (gZ_r + dZ_r);
1145  const ST h_zx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) -
1146  kZ * (gX_r + dX_r);
1147  const ST h_zy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) -
1148  kZ * (gY_r + dY_r);
1149  const ST h_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) -
1150  kZ * (gZ_r + dZ_r);
1151 
1152  grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
1153  grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
1154  grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
1155  grad_grad_psi[psiIndex][3] = c * h_yx_r - s * h_yx_i;
1156  grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
1157  grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
1158  grad_grad_psi[psiIndex][6] = c * h_zx_r - s * h_zx_i;
1159  grad_grad_psi[psiIndex][7] = c * h_zy_r - s * h_zy_i;
1160  grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
1161  }
1162  }
1163 }
1164 
1165 template<typename ST>
1167  const int iat,
1168  ValueVector& psi,
1169  GradVector& dpsi,
1170  HessVector& grad_grad_psi)
1171 {
1172  const PointType& r = P.activeR(iat);
1173  PointType ru(PrimLattice.toUnit_floor(r));
1174 #pragma omp parallel
1175  {
1176  int first, last;
1177  FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
1178 
1179  spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
1180  assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
1181  }
1182 }
1183 
1184 template<typename ST>
1186  ValueVector& psi,
1187  GradVector& dpsi,
1188  HessVector& grad_grad_psi,
1189  GGGVector& grad_grad_grad_psi,
1190  int first,
1191  int last) const
1192 {
1193  // protect last
1194  last = last < 0 ? kPoints.size() : (last > kPoints.size() ? kPoints.size() : last);
1195 
1196  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
1197  g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
1198  g22 = PrimLattice.G(8);
1199  const ST x = r[0], y = r[1], z = r[2];
1200 
1201  const ST* restrict k0 = myKcart->data(0);
1202  const ST* restrict k1 = myKcart->data(1);
1203  const ST* restrict k2 = myKcart->data(2);
1204 
1205  const ST* restrict g0 = myG.data(0);
1206  const ST* restrict g1 = myG.data(1);
1207  const ST* restrict g2 = myG.data(2);
1208  const ST* restrict h00 = myH.data(0);
1209  const ST* restrict h01 = myH.data(1);
1210  const ST* restrict h02 = myH.data(2);
1211  const ST* restrict h11 = myH.data(3);
1212  const ST* restrict h12 = myH.data(4);
1213  const ST* restrict h22 = myH.data(5);
1214 
1215  const ST* restrict gh000 = mygH.data(0);
1216  const ST* restrict gh001 = mygH.data(1);
1217  const ST* restrict gh002 = mygH.data(2);
1218  const ST* restrict gh011 = mygH.data(3);
1219  const ST* restrict gh012 = mygH.data(4);
1220  const ST* restrict gh022 = mygH.data(5);
1221  const ST* restrict gh111 = mygH.data(6);
1222  const ST* restrict gh112 = mygH.data(7);
1223  const ST* restrict gh122 = mygH.data(8);
1224  const ST* restrict gh222 = mygH.data(9);
1225 
1226  const size_t requested_orb_size = psi.size();
1227 //SIMD doesn't work quite right yet. Comment out until further debugging.
1228 #pragma omp simd
1229  for (size_t j = first; j < std::min(nComplexBands, last); j++)
1230  {
1231  int jr = j << 1;
1232  int ji = jr + 1;
1233 
1234  const ST kX = k0[j];
1235  const ST kY = k1[j];
1236  const ST kZ = k2[j];
1237  const ST val_r = myV[jr];
1238  const ST val_i = myV[ji];
1239 
1240  //phase
1241  ST s, c;
1242  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
1243 
1244  //dot(PrimLattice.G,myG[j])
1245  const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1246  const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1247  const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1248 
1249  const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1250  const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1251  const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1252 
1253  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
1254  const ST gX_r = dX_r + val_i * kX;
1255  const ST gY_r = dY_r + val_i * kY;
1256  const ST gZ_r = dZ_r + val_i * kZ;
1257  const ST gX_i = dX_i - val_r * kX;
1258  const ST gY_i = dY_i - val_r * kY;
1259  const ST gZ_i = dZ_i - val_r * kZ;
1260 
1261  const size_t psiIndex = first_spo + jr;
1262 
1263  if (psiIndex < requested_orb_size)
1264  {
1265  psi[psiIndex] = c * val_r - s * val_i;
1266  dpsi[psiIndex][0] = c * gX_r - s * gX_i;
1267  dpsi[psiIndex][1] = c * gY_r - s * gY_i;
1268  dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
1269  }
1270 
1271  if (psiIndex + 1 < requested_orb_size)
1272  {
1273  psi[psiIndex + 1] = c * val_i + s * val_r;
1274  dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
1275  dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
1276  dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
1277  }
1278 
1279  //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
1280  const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
1281  const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
1282  const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
1283  const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
1284  const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1285  const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1286 
1287  const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1288  const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1289  const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1290  const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1291  const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1292  const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1293 
1294  const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1295  const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1296  const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1297  const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1298  const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1299  const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1300 
1301  const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1302  const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1303  const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1304  const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1305  const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1306  const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1307 
1308  if (psiIndex < requested_orb_size)
1309  {
1310  grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
1311  grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
1312  grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
1313  grad_grad_psi[psiIndex][3] = c * h_xy_r - s * h_xy_i;
1314  grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
1315  grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
1316  grad_grad_psi[psiIndex][6] = c * h_xz_r - s * h_xz_i;
1317  grad_grad_psi[psiIndex][7] = c * h_yz_r - s * h_yz_i;
1318  grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
1319  }
1320 
1321  if (psiIndex + 1 < requested_orb_size)
1322  {
1323  grad_grad_psi[psiIndex + 1][0] = c * h_xx_i + s * h_xx_r;
1324  grad_grad_psi[psiIndex + 1][1] = c * h_xy_i + s * h_xy_r;
1325  grad_grad_psi[psiIndex + 1][2] = c * h_xz_i + s * h_xz_r;
1326  grad_grad_psi[psiIndex + 1][3] = c * h_xy_i + s * h_xy_r;
1327  grad_grad_psi[psiIndex + 1][4] = c * h_yy_i + s * h_yy_r;
1328  grad_grad_psi[psiIndex + 1][5] = c * h_yz_i + s * h_yz_r;
1329  grad_grad_psi[psiIndex + 1][6] = c * h_xz_i + s * h_xz_r;
1330  grad_grad_psi[psiIndex + 1][7] = c * h_yz_i + s * h_yz_r;
1331  grad_grad_psi[psiIndex + 1][8] = c * h_zz_i + s * h_zz_r;
1332  }
1333 
1334  //These are the real and imaginary components of the third SPO derivative. _xxx denotes
1335  // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
1336 
1337  const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1338  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1339  const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1340  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1341  const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1342  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1343  const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1344  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1345  const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1346  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1347  const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1348  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1349  const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1350  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1351  const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1352  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1353  const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1354  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1355  const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1356  gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1357 
1358  const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1359  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1360  const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1361  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1362  const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1363  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1364  const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1365  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1366  const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1367  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1368  const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1369  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1370  const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1371  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1372  const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1373  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1374  const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1375  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1376  const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1377  gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1378 
1379  //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
1380  const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1381  const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1382  const ST gh_xxy_r =
1383  f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1384  const ST gh_xxy_i =
1385  f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1386  const ST gh_xxz_r =
1387  f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1388  const ST gh_xxz_i =
1389  f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1390  const ST gh_xyy_r =
1391  f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1392  const ST gh_xyy_i =
1393  f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1394  const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1395  (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1396  const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1397  (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1398  const ST gh_xzz_r =
1399  f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1400  const ST gh_xzz_i =
1401  f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1402  const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1403  const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1404  const ST gh_yyz_r =
1405  f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1406  const ST gh_yyz_i =
1407  f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1408  const ST gh_yzz_r =
1409  f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1410  const ST gh_yzz_i =
1411  f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1412  const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1413  const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1414 
1415  if (psiIndex < requested_orb_size)
1416  {
1417  grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r - s * gh_xxx_i;
1418  grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r - s * gh_xxy_i;
1419  grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r - s * gh_xxz_i;
1420  grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r - s * gh_xxy_i;
1421  grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r - s * gh_xyy_i;
1422  grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r - s * gh_xyz_i;
1423  grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r - s * gh_xxz_i;
1424  grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r - s * gh_xyz_i;
1425  grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r - s * gh_xzz_i;
1426 
1427  grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r - s * gh_xxy_i;
1428  grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r - s * gh_xyy_i;
1429  grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r - s * gh_xyz_i;
1430  grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r - s * gh_xyy_i;
1431  grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r - s * gh_yyy_i;
1432  grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r - s * gh_yyz_i;
1433  grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r - s * gh_xyz_i;
1434  grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r - s * gh_yyz_i;
1435  grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r - s * gh_yzz_i;
1436 
1437  grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r - s * gh_xxz_i;
1438  grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r - s * gh_xyz_i;
1439  grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r - s * gh_xzz_i;
1440  grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r - s * gh_xyz_i;
1441  grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r - s * gh_yyz_i;
1442  grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r - s * gh_yzz_i;
1443  grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r - s * gh_xzz_i;
1444  grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r - s * gh_yzz_i;
1445  grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r - s * gh_zzz_i;
1446  }
1447 
1448  if (psiIndex + 1 < requested_orb_size)
1449  {
1450  grad_grad_grad_psi[psiIndex + 1][0][0] = c * gh_xxx_i + s * gh_xxx_r;
1451  grad_grad_grad_psi[psiIndex + 1][0][1] = c * gh_xxy_i + s * gh_xxy_r;
1452  grad_grad_grad_psi[psiIndex + 1][0][2] = c * gh_xxz_i + s * gh_xxz_r;
1453  grad_grad_grad_psi[psiIndex + 1][0][3] = c * gh_xxy_i + s * gh_xxy_r;
1454  grad_grad_grad_psi[psiIndex + 1][0][4] = c * gh_xyy_i + s * gh_xyy_r;
1455  grad_grad_grad_psi[psiIndex + 1][0][5] = c * gh_xyz_i + s * gh_xyz_r;
1456  grad_grad_grad_psi[psiIndex + 1][0][6] = c * gh_xxz_i + s * gh_xxz_r;
1457  grad_grad_grad_psi[psiIndex + 1][0][7] = c * gh_xyz_i + s * gh_xyz_r;
1458  grad_grad_grad_psi[psiIndex + 1][0][8] = c * gh_xzz_i + s * gh_xzz_r;
1459 
1460  grad_grad_grad_psi[psiIndex + 1][1][0] = c * gh_xxy_i + s * gh_xxy_r;
1461  grad_grad_grad_psi[psiIndex + 1][1][1] = c * gh_xyy_i + s * gh_xyy_r;
1462  grad_grad_grad_psi[psiIndex + 1][1][2] = c * gh_xyz_i + s * gh_xyz_r;
1463  grad_grad_grad_psi[psiIndex + 1][1][3] = c * gh_xyy_i + s * gh_xyy_r;
1464  grad_grad_grad_psi[psiIndex + 1][1][4] = c * gh_yyy_i + s * gh_yyy_r;
1465  grad_grad_grad_psi[psiIndex + 1][1][5] = c * gh_yyz_i + s * gh_yyz_r;
1466  grad_grad_grad_psi[psiIndex + 1][1][6] = c * gh_xyz_i + s * gh_xyz_r;
1467  grad_grad_grad_psi[psiIndex + 1][1][7] = c * gh_yyz_i + s * gh_yyz_r;
1468  grad_grad_grad_psi[psiIndex + 1][1][8] = c * gh_yzz_i + s * gh_yzz_r;
1469 
1470  grad_grad_grad_psi[psiIndex + 1][2][0] = c * gh_xxz_i + s * gh_xxz_r;
1471  grad_grad_grad_psi[psiIndex + 1][2][1] = c * gh_xyz_i + s * gh_xyz_r;
1472  grad_grad_grad_psi[psiIndex + 1][2][2] = c * gh_xzz_i + s * gh_xzz_r;
1473  grad_grad_grad_psi[psiIndex + 1][2][3] = c * gh_xyz_i + s * gh_xyz_r;
1474  grad_grad_grad_psi[psiIndex + 1][2][4] = c * gh_yyz_i + s * gh_yyz_r;
1475  grad_grad_grad_psi[psiIndex + 1][2][5] = c * gh_yzz_i + s * gh_yzz_r;
1476  grad_grad_grad_psi[psiIndex + 1][2][6] = c * gh_xzz_i + s * gh_xzz_r;
1477  grad_grad_grad_psi[psiIndex + 1][2][7] = c * gh_yzz_i + s * gh_yzz_r;
1478  grad_grad_grad_psi[psiIndex + 1][2][8] = c * gh_zzz_i + s * gh_zzz_r;
1479  }
1480  }
1481 #pragma omp simd
1482  for (size_t j = std::max(nComplexBands, first); j < last; j++)
1483  {
1484  int jr = j << 1;
1485  int ji = jr + 1;
1486 
1487  const ST kX = k0[j];
1488  const ST kY = k1[j];
1489  const ST kZ = k2[j];
1490  const ST val_r = myV[jr];
1491  const ST val_i = myV[ji];
1492 
1493  //phase
1494  ST s, c;
1495  omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
1496 
1497  //dot(PrimLattice.G,myG[j])
1498  const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1499  const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1500  const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1501 
1502  const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1503  const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1504  const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1505 
1506  // \f$\nabla \psi_r + {\bf k}\psi_i\f$
1507  const ST gX_r = dX_r + val_i * kX;
1508  const ST gY_r = dY_r + val_i * kY;
1509  const ST gZ_r = dZ_r + val_i * kZ;
1510  const ST gX_i = dX_i - val_r * kX;
1511  const ST gY_i = dY_i - val_r * kY;
1512  const ST gZ_i = dZ_i - val_r * kZ;
1513 
1514  if (const size_t psiIndex = first_spo + nComplexBands + j; psiIndex < requested_orb_size)
1515  {
1516  psi[psiIndex] = c * val_r - s * val_i;
1517  dpsi[psiIndex][0] = c * gX_r - s * gX_i;
1518  dpsi[psiIndex][1] = c * gY_r - s * gY_i;
1519  dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
1520 
1521  //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
1522  const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
1523  const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
1524  const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
1525  const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
1526  const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1527  const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1528 
1529  const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1530  const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1531  const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1532  const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1533  const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1534  const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1535 
1536  const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1537  const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1538  const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1539  const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1540  const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1541  const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1542 
1543  const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1544  const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1545  const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1546  const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1547  const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1548  const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1549 
1550  grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
1551  grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
1552  grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
1553  grad_grad_psi[psiIndex][3] = c * h_xy_r - s * h_xy_i;
1554  grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
1555  grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
1556  grad_grad_psi[psiIndex][6] = c * h_xz_r - s * h_xz_i;
1557  grad_grad_psi[psiIndex][7] = c * h_yz_r - s * h_yz_i;
1558  grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
1559 
1560  //These are the real and imaginary components of the third SPO derivative. _xxx denotes
1561  // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
1562 
1563  const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1564  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1565  const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1566  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1567  const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1568  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1569  const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1570  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1571  const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1572  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1573  const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1574  gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1575  const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1576  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1577  const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1578  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1579  const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1580  gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1581  const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1582  gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1583 
1584  const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1585  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1586  const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1587  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1588  const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1589  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1590  const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1591  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1592  const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1593  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1594  const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1595  gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1596  const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1597  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1598  const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1599  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1600  const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1601  gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1602  const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1603  gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1604 
1605  //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
1606  const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1607  const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1608  const ST gh_xxy_r =
1609  f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1610  const ST gh_xxy_i =
1611  f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1612  const ST gh_xxz_r =
1613  f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1614  const ST gh_xxz_i =
1615  f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1616  const ST gh_xyy_r =
1617  f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1618  const ST gh_xyy_i =
1619  f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1620  const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1621  (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1622  const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1623  (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1624  const ST gh_xzz_r =
1625  f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1626  const ST gh_xzz_i =
1627  f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1628  const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1629  const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1630  const ST gh_yyz_r =
1631  f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1632  const ST gh_yyz_i =
1633  f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1634  const ST gh_yzz_r =
1635  f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1636  const ST gh_yzz_i =
1637  f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1638  const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1639  const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1640  //[x][xx] //These are the unique entries
1641  grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r - s * gh_xxx_i;
1642  grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r - s * gh_xxy_i;
1643  grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r - s * gh_xxz_i;
1644  grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r - s * gh_xxy_i;
1645  grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r - s * gh_xyy_i;
1646  grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r - s * gh_xyz_i;
1647  grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r - s * gh_xxz_i;
1648  grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r - s * gh_xyz_i;
1649  grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r - s * gh_xzz_i;
1650 
1651  grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r - s * gh_xxy_i;
1652  grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r - s * gh_xyy_i;
1653  grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r - s * gh_xyz_i;
1654  grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r - s * gh_xyy_i;
1655  grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r - s * gh_yyy_i;
1656  grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r - s * gh_yyz_i;
1657  grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r - s * gh_xyz_i;
1658  grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r - s * gh_yyz_i;
1659  grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r - s * gh_yzz_i;
1660 
1661  grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r - s * gh_xxz_i;
1662  grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r - s * gh_xyz_i;
1663  grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r - s * gh_xzz_i;
1664  grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r - s * gh_xyz_i;
1665  grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r - s * gh_yyz_i;
1666  grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r - s * gh_yzz_i;
1667  grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r - s * gh_xzz_i;
1668  grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r - s * gh_yzz_i;
1669  grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r - s * gh_zzz_i;
1670  }
1671  }
1672 }
1673 
1674 template<typename ST>
1676  const int iat,
1677  ValueVector& psi,
1678  GradVector& dpsi,
1679  HessVector& grad_grad_psi,
1680  GGGVector& grad_grad_grad_psi)
1681 {
1682  const PointType& r = P.activeR(iat);
1683  PointType ru(PrimLattice.toUnit_floor(r));
1684 #pragma omp parallel
1685  {
1686  int first, last;
1687  FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
1688 
1689  spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
1690  assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
1691  }
1692 }
1693 
1694 template<typename ST>
1696  int first,
1697  int last,
1698  ValueMatrix& logdet,
1699  GradMatrix& dlogdet,
1700  ValueMatrix& d2logdet)
1701 {
1702  // chunk the [first, last) loop into blocks to save temporary memory usage
1703  const int block_size = 16;
1704 
1705  // reference vectors refer to the rows of matrices
1706  std::vector<ValueVector> multi_psi_v;
1707  std::vector<GradVector> multi_dpsi_v;
1708  std::vector<ValueVector> multi_d2psi_v;
1709  RefVector<ValueVector> psi_v_list;
1710  RefVector<GradVector> dpsi_v_list;
1711  RefVector<ValueVector> d2psi_v_list;
1712 
1713  multi_psi_v.reserve(block_size);
1714  multi_dpsi_v.reserve(block_size);
1715  multi_d2psi_v.reserve(block_size);
1716  psi_v_list.reserve(block_size);
1717  dpsi_v_list.reserve(block_size);
1718  d2psi_v_list.reserve(block_size);
1719 
1720  for (int iat = first, i = 0; iat < last; iat += block_size, i += block_size)
1721  {
1722  const int actual_block_size = std::min(last - iat, block_size);
1723  multi_pos_copy.resize(actual_block_size * 6);
1724  multi_psi_v.clear();
1725  multi_dpsi_v.clear();
1726  multi_d2psi_v.clear();
1727  psi_v_list.clear();
1728  dpsi_v_list.clear();
1729  d2psi_v_list.clear();
1730 
1731  for (int ipos = 0; ipos < actual_block_size; ++ipos)
1732  {
1733  // pack particle positions
1734  const PointType& r = P.activeR(iat + ipos);
1735  PointType ru(PrimLattice.toUnit_floor(r));
1736  multi_pos_copy[ipos * 6] = r[0];
1737  multi_pos_copy[ipos * 6 + 1] = r[1];
1738  multi_pos_copy[ipos * 6 + 2] = r[2];
1739  multi_pos_copy[ipos * 6 + 3] = ru[0];
1740  multi_pos_copy[ipos * 6 + 4] = ru[1];
1741  multi_pos_copy[ipos * 6 + 5] = ru[2];
1742 
1743  multi_psi_v.emplace_back(logdet[i + ipos], OrbitalSetSize);
1744  multi_dpsi_v.emplace_back(dlogdet[i + ipos], OrbitalSetSize);
1745  multi_d2psi_v.emplace_back(d2logdet[i + ipos], OrbitalSetSize);
1746 
1747  psi_v_list.push_back(multi_psi_v[ipos]);
1748  dpsi_v_list.push_back(multi_dpsi_v[ipos]);
1749  d2psi_v_list.push_back(multi_d2psi_v[ipos]);
1750  }
1751 
1752  evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
1753  }
1754 }
1755 
1756 template class SplineC2ROMPTarget<float>;
1757 template class SplineC2ROMPTarget<double>;
1758 
1759 } // namespace qmcplusplus
OrbitalSetTraits< ValueType >::HessVector HessVector
Definition: SPOSet.h:53
T SymTrace(T h00, T h01, T h02, T h11, T h12, T h22, const T gg[6])
compute Trace(H*G)
Fixed-size array.
Definition: OhmmsTinyMeta.h:30
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const std::vector< const ValueType *> &invRow_ptr_list, OffloadMWVGLArray &phi_vgl_v, std::vector< ValueType > &ratios, std::vector< GradType > &grads) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ra...
virtual void evaluateVGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi) override
evaluate the values, gradients and hessians of this single-particle orbital set
virtual void mw_evaluateDetRatios(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< const VirtualParticleSet > &vp_list, const RefVector< ValueVector > &psi_list, const std::vector< const ValueType *> &invRow_ptr_list, std::vector< std::vector< ValueType >> &ratios_list) const override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP, of multiple walkers ...
Type_t * data()
Definition: OhmmsArray.h:87
A ParticleSet that handles virtual moves of a selected particle of a given physical ParticleSet Virtu...
OrbitalSetTraits< ValueType >::ValueMatrix ValueMatrix
Definition: SPOSet.h:50
void assign_vgl_from_l(const PointType &r, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi)
assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian ...
T v_m_v(T h00, T h01, T h02, T h11, T h12, T h22, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z)
compute vector[3]^T x matrix[3][3] x vector[3]
T t3_contract(T h000, T h001, T h002, T h011, T h012, T h022, T h111, T h112, T h122, T h222, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z, T g3x, T g3y, T g3z)
Coordinate transform for a 3rd rank symmetric tensor representing coordinate derivatives (hence t3_co...
class to handle hdf file
Definition: hdf_archive.h:51
void assign_vgl(ST x, ST y, ST z, TT *restrict results_scratch_ptr, size_t orb_padded_size, const ST *mKK_ptr, const ST *restrict offload_scratch_ptr, size_t spline_padded_size, const ST G[9], const ST *myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int nComplexBands, int index)
assign_vgl
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
bool read_splines(hdf_archive &h5f)
void evaluateVGLMultiPos(const Vector< ST, OffloadPinnedAllocator< ST >> &multi_pos_copy, Vector< ST, OffloadPinnedAllocator< ST >> &offload_scratch, Vector< TT, OffloadPinnedAllocator< TT >> &results_scratch, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const
T min(T a, T b)
OrbitalSetTraits< ValueType >::GradMatrix GradMatrix
Definition: SPOSet.h:52
void assign_vgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, int first, int last) const
omp_int_t omp_get_thread_num()
Definition: OpenMP.h:25
Specialized paritlce class for atomistic simulations.
Definition: ParticleSet.h:55
void assign_vghgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi, int first=0, int last=-1) const
void assign_v(const PointType &r, const vContainer_type &myV, ValueVector &psi, int first, int last) const
class to match std::complex<ST> spline with BsplineSet::ValueType (real) SPOs with OpenMP offload ...
virtual void mw_evaluateVGL(const RefVectorWithLeader< SPOSet > &sa_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets of multiple walker...
bool write_splines(hdf_archive &h5f)
QTBase::ValueType ValueType
Definition: Configuration.h:60
ResourceHandle< SplineOMPTargetMultiWalkerMem< ST, TT > > mw_mem_handle_
void FairDivideAligned(const int ntot, const int base, const int npart, const int me, int &first, int &last)
Partition ntot over npart and the size of each partition is a multiple of base size.
Definition: FairDivide.h:96
OrbitalSetTraits< ValueType >::ValueVector ValueVector
Definition: SPOSet.h:49
virtual void evaluateDetRatios(const VirtualParticleSet &VP, ValueVector &psi, const ValueVector &psiinv, std::vector< ValueType > &ratios) override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP
size_t size() const
Definition: OhmmsArray.h:57
class to handle complex splines to real orbitals with splines of arbitrary precision splines storage ...
virtual void evaluateVGHGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi) override
evaluate the values, gradients, hessians, and grad hessians of this single-particle orbital set ...
const PosType & activeR(int iat) const
return the active position if the particle is active or the return current position if not ...
Definition: ParticleSet.h:265
omp_int_t omp_get_num_threads()
Definition: OpenMP.h:27
OMPallocator is an allocator with fused device and dualspace allocator functionality.
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int nComplexBands, int index)
std::vector< std::reference_wrapper< T > > RefVector
void set_spline(SingleSplineType *spline_r, SingleSplineType *spline_i, int twist, int ispline, int level)
OrbitalSetTraits< ValueType >::GradHessVector GGGVector
Definition: SPOSet.h:55
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
OrbitalSetTraits< ValueType >::GradVector GradVector
Definition: SPOSet.h:51
typename BsplineSet::ValueType TT
void sincos(T a, T *restrict s, T *restrict c)
sincos function wrapper
Definition: math.hpp:62
#define ASSUME_ALIGNED(x)
virtual void evaluate_notranspose(const ParticleSet &P, int first, int last, ValueMatrix &logdet, GradMatrix &dlogdet, ValueMatrix &d2logdet) override
evaluate the values, gradients and laplacians of this single-particle orbital for [first...
bool readEntry(T &data, const std::string &aname)
read the data from the group aname and return status use read() for inbuilt error checking ...
Definition: hdf_archive.h:293
virtual void evaluateVGL(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi) override
evaluate the values, gradients and laplacians of this single-particle orbital set ...
A D-dimensional Array class based on PETE.
Definition: OhmmsArray.h:25
SplineC2ROMPTarget(const std::string &my_name)
virtual void evaluateValue(const ParticleSet &P, const int iat, ValueVector &psi) override
evaluate the values of this single-particle orbital set
bool writeEntry(T &data, const std::string &aname)
write the data to the group aname and return status use write() for inbuilt error checking ...
Definition: hdf_archive.h:244