14 #include "spline2/MultiBsplineEval.hpp" 15 #include "spline2/MultiBsplineEval_OMPoffload.hpp" 33 SplineInst->copy_spline(spline_r, 2 * ispline);
34 SplineInst->copy_spline(spline_i, 2 * ispline + 1);
41 o <<
"spline_" << MyIndex;
42 einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
43 return h5f.
readEntry(bigtable, o.str().c_str());
50 o <<
"spline_" << MyIndex;
51 einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
52 return h5f.
writeEntry(bigtable, o.str().c_str());
63 const size_t last_cplx =
std::min(kPoints.size(), psi.size());
64 last = last > last_cplx ? last_cplx : last;
66 const ST x = r[0], y = r[1], z = r[2];
67 const ST* restrict kx = myKcart->
data(0);
68 const ST* restrict ky = myKcart->data(1);
69 const ST* restrict kz = myKcart->data(2);
71 for (
size_t j = first; j < last; ++j)
74 const ST val_r = myV[2 * j];
75 const ST val_i = myV[2 * j + 1];
77 psi[j + first_spo] =
ComplexT(val_r * c - val_i *
s, val_i * c + val_r *
s);
85 PointType ru(PrimLattice.toUnit_floor(r));
93 spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
94 assign_v(r, myV, psi, first / 2, last / 2);
102 std::vector<ValueType>& ratios)
105 psiinv_pos_copy.resize(psiinv.size() + nVP * 3);
108 std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
111 auto* restrict pos_scratch =
reinterpret_cast<RealType*
>(psiinv_pos_copy.data() + psiinv.size());
112 for (
int iat = 0; iat < nVP; ++iat)
115 PointType ru(PrimLattice.toUnit_floor(r));
116 pos_scratch[iat * 6] = r[0];
117 pos_scratch[iat * 6 + 1] = r[1];
118 pos_scratch[iat * 6 + 2] = r[2];
119 pos_scratch[iat * 6 + 3] = ru[0];
120 pos_scratch[iat * 6 + 4] = ru[1];
121 pos_scratch[iat * 6 + 5] = ru[2];
124 const size_t ChunkSizePerTeam = 512;
125 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
126 ratios_private.resize(nVP, NumTeams);
127 const auto spline_padded_size = myV.
size();
128 const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
129 offload_scratch.resize(spline_padded_size * nVP);
130 results_scratch.resize(sposet_padded_size * nVP);
133 const auto* spline_ptr = SplineInst->getSplinePtr();
134 auto* offload_scratch_ptr = offload_scratch.data();
135 auto* results_scratch_ptr = results_scratch.data();
136 const auto myKcart_padded_size = myKcart->capacity();
137 auto* myKcart_ptr = myKcart->data();
138 auto* psiinv_ptr = psiinv_pos_copy.data();
139 auto* ratios_private_ptr = ratios_private.data();
140 const size_t first_spo_local = first_spo;
141 const auto orb_size = psiinv.size();
145 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \ 146 map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \ 147 map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
148 for (
int iat = 0; iat < nVP; iat++)
149 for (
int team_id = 0; team_id < NumTeams; team_id++)
151 const size_t first = ChunkSizePerTeam * team_id;
152 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
154 auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
155 auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
156 auto* restrict pos_scratch =
reinterpret_cast<RealType*
>(psiinv_ptr + orb_size);
160 spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
161 ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
163 PRAGMA_OFFLOAD(
"omp parallel for")
164 for (
int index = 0; index < last - first; index++)
165 spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
166 offload_scratch_iat_ptr + first + index);
167 const size_t first_cplx = first / 2;
169 PRAGMA_OFFLOAD(
"omp parallel for")
170 for (
int index = first_cplx; index < last_cplx; index++)
171 C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
172 psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
175 PRAGMA_OFFLOAD(
"omp parallel for simd reduction(+:sum)")
176 for (
int i = first_cplx; i < last_cplx; i++)
177 sum += psi_iat_ptr[i] * psiinv_ptr[i];
178 ratios_private_ptr[iat * NumTeams + team_id] = sum;
183 for (
int iat = 0; iat < nVP; ++iat)
186 for (
int tid = 0; tid < NumTeams; tid++)
187 ratios[iat] += ratios_private[iat][tid];
191 template<
typename ST>
195 const std::vector<const ValueType*>& invRow_ptr_list,
196 std::vector<std::vector<ValueType>>& ratios_list)
const 201 auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
202 auto& mw_ratios_private = mw_mem.mw_ratios_private;
203 auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
204 auto& mw_results_scratch = mw_mem.mw_results_scratch;
205 const size_t nw = spo_list.size();
206 const size_t orb_size = phi_leader.size();
210 mw_nVP += VP.getTotalNum();
212 const size_t packed_size = nw *
sizeof(
ValueType*) + mw_nVP * (6 *
sizeof(ST) +
sizeof(int));
213 det_ratios_buffer_H2D.resize(packed_size);
217 for (
size_t iw = 0; iw < nw; iw++)
218 ptr_buffer[iw] = invRow_ptr_list[iw];
221 auto* pos_ptr =
reinterpret_cast<ST*
>(det_ratios_buffer_H2D.data() + nw *
sizeof(
ValueType*));
223 reinterpret_cast<int*
>(det_ratios_buffer_H2D.data() + nw *
sizeof(
ValueType*) + mw_nVP * 6 *
sizeof(ST));
225 for (
size_t iw = 0; iw < nw; iw++)
228 assert(ratios_list[iw].size() == VP.
getTotalNum());
229 for (
size_t iat = 0; iat < VP.
getTotalNum(); ++iat, ++iVP)
231 ref_id_ptr[iVP] = iw;
233 PointType ru(PrimLattice.toUnit_floor(r));
244 const size_t ChunkSizePerTeam = 512;
245 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
246 mw_ratios_private.resize(mw_nVP, NumTeams);
247 const auto spline_padded_size = myV.
size();
248 const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
249 mw_offload_scratch.resize(spline_padded_size * mw_nVP);
250 mw_results_scratch.resize(sposet_padded_size * mw_nVP);
253 const auto* spline_ptr = SplineInst->getSplinePtr();
254 auto* offload_scratch_ptr = mw_offload_scratch.data();
255 auto* results_scratch_ptr = mw_results_scratch.data();
256 const auto myKcart_padded_size = myKcart->capacity();
257 auto* myKcart_ptr = myKcart->data();
258 auto* buffer_H2D_ptr = det_ratios_buffer_H2D.data();
259 auto* ratios_private_ptr = mw_ratios_private.data();
260 const size_t first_spo_local = first_spo;
264 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \ 265 map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \ 266 map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
267 for (
int iat = 0; iat < mw_nVP; iat++)
268 for (
int team_id = 0; team_id < NumTeams; team_id++)
270 const size_t first = ChunkSizePerTeam * team_id;
271 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
273 auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
274 auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
275 auto* ref_id_ptr =
reinterpret_cast<int*
>(buffer_H2D_ptr + nw *
sizeof(
ValueType*) + mw_nVP * 6 *
sizeof(ST));
276 auto* restrict psiinv_ptr =
reinterpret_cast<const ValueType**
>(buffer_H2D_ptr)[ref_id_ptr[iat]];
277 auto* restrict pos_scratch =
reinterpret_cast<ST*
>(buffer_H2D_ptr + nw *
sizeof(
ValueType*));
281 spline2::computeLocationAndFractional(spline_ptr, pos_scratch[iat * 6 + 3], pos_scratch[iat * 6 + 4],
282 pos_scratch[iat * 6 + 5], ix, iy, iz, a, b, c);
284 PRAGMA_OFFLOAD(
"omp parallel for")
285 for (
int index = 0; index < last - first; index++)
286 spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
287 offload_scratch_iat_ptr + first + index);
288 const size_t first_cplx = first / 2;
290 PRAGMA_OFFLOAD(
"omp parallel for")
291 for (
int index = first_cplx; index < last_cplx; index++)
292 C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr,
293 offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
296 PRAGMA_OFFLOAD(
"omp parallel for simd reduction(+:sum)")
297 for (
int i = first_cplx; i < last_cplx; i++)
298 sum += psi_iat_ptr[i] * psiinv_ptr[i];
299 ratios_private_ptr[iat * NumTeams + team_id] = sum;
305 for (
size_t iw = 0; iw < nw; iw++)
307 auto& ratios = ratios_list[iw];
308 for (
size_t iat = 0; iat < ratios.size(); iat++, iVP++)
311 for (
int tid = 0; tid < NumTeams; ++tid)
312 ratios[iat] += mw_ratios_private[iVP][tid];
319 template<
typename ST>
326 const ST x = r[0], y = r[1], z = r[2];
328 const ST* restrict k0 = myKcart->
data(0);
329 const ST* restrict k1 = myKcart->data(1);
330 const ST* restrict k2 = myKcart->data(2);
332 const ST* restrict g0 = myG.data(0);
333 const ST* restrict g1 = myG.data(1);
334 const ST* restrict g2 = myG.data(2);
336 const size_t last_cplx = last_spo > psi.size() ? psi.size() : last_spo;
337 const size_t N = last_cplx - first_spo;
339 for (
size_t j = 0; j <
N; ++j)
341 const size_t jr = j << 1;
342 const size_t ji = jr + 1;
347 const ST val_r = myV[jr];
348 const ST val_i = myV[ji];
355 const ST dX_r = g0[jr];
356 const ST dY_r = g1[jr];
357 const ST dZ_r = g2[jr];
359 const ST dX_i = g0[ji];
360 const ST dY_i = g1[ji];
361 const ST dZ_i = g2[ji];
364 const ST gX_r = dX_r + val_i * kX;
365 const ST gY_r = dY_r + val_i * kY;
366 const ST gZ_r = dZ_r + val_i * kZ;
367 const ST gX_i = dX_i - val_r * kX;
368 const ST gY_i = dY_i - val_r * kY;
369 const ST gZ_i = dZ_i - val_r * kZ;
371 const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
372 const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
374 const size_t psiIndex = j + first_spo;
375 psi[psiIndex] =
ComplexT(c * val_r -
s * val_i, c * val_i +
s * val_r);
376 dpsi[psiIndex][0] =
ComplexT(c * gX_r -
s * gX_i, c * gX_i +
s * gX_r);
377 dpsi[psiIndex][1] =
ComplexT(c * gY_r -
s * gY_i, c * gY_i +
s * gY_r);
378 dpsi[psiIndex][2] =
ComplexT(c * gZ_r -
s * gZ_i, c * gZ_i +
s * gZ_r);
379 d2psi[psiIndex] =
ComplexT(c * lap_r -
s * lap_i, c * lap_i +
s * lap_r);
383 template<
typename ST>
391 PointType ru(PrimLattice.toUnit_floor(r));
393 const size_t ChunkSizePerTeam = 512;
394 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
396 const auto spline_padded_size = myV.
size();
397 const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
400 results_scratch.resize(sposet_padded_size * 5);
403 const auto* spline_ptr = SplineInst->getSplinePtr();
404 auto* offload_scratch_ptr = offload_scratch.data();
405 auto* results_scratch_ptr = results_scratch.data();
406 const auto x = r[0], y = r[1], z = r[2];
407 const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
408 const auto myKcart_padded_size = myKcart->capacity();
409 auto* mKK_ptr = mKK->
data();
410 auto* GGt_ptr = GGt_offload->data();
411 auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
412 auto* myKcart_ptr = myKcart->data();
413 const size_t first_spo_local = first_spo;
414 const auto orb_size = psi.size();
418 PRAGMA_OFFLOAD(
"omp target teams distribute num_teams(NumTeams) \ 419 map(always, from: results_scratch_ptr[0:sposet_padded_size*5])")
420 for (
int team_id = 0; team_id < NumTeams; team_id++)
422 const size_t first = ChunkSizePerTeam * team_id;
423 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
426 ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
427 spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
429 const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
430 PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
431 PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
432 const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
433 GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
435 PRAGMA_OFFLOAD(
"omp parallel for")
436 for (
int index = 0; index < last - first; index++)
438 spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
439 offload_scratch_ptr + first + index, spline_padded_size);
440 const int output_index = first + index;
450 const size_t first_cplx = first / 2;
452 PRAGMA_OFFLOAD(
"omp parallel for")
453 for (
int index = first_cplx; index < last_cplx; index++)
454 C2C::assign_vgl(x, y, z, results_scratch_ptr, sposet_padded_size, mKK_ptr, offload_scratch_ptr,
455 spline_padded_size, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
459 for (
size_t i = 0; i < orb_size; i++)
461 psi[i] = results_scratch[i];
462 dpsi[i][0] = results_scratch[i + sposet_padded_size];
463 dpsi[i][1] = results_scratch[i + sposet_padded_size * 2];
464 dpsi[i][2] = results_scratch[i + sposet_padded_size * 3];
465 d2psi[i] = results_scratch[i + sposet_padded_size * 4];
469 template<
typename ST>
477 const size_t num_pos = psi_v_list.size();
478 const size_t ChunkSizePerTeam = 512;
479 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
480 const auto spline_padded_size = myV.size();
481 const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
483 const auto orb_size = psi_v_list[0].get().size();
485 results_scratch.resize(sposet_padded_size * num_pos * 5);
488 const auto* spline_ptr = SplineInst->getSplinePtr();
489 auto* pos_copy_ptr = multi_pos.data();
490 auto* offload_scratch_ptr = offload_scratch.data();
491 auto* results_scratch_ptr = results_scratch.data();
492 const auto myKcart_padded_size = myKcart->capacity();
493 auto* mKK_ptr = mKK->data();
494 auto* GGt_ptr = GGt_offload->data();
495 auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
496 auto* myKcart_ptr = myKcart->data();
497 const size_t first_spo_local = first_spo;
501 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \ 502 map(always, to: pos_copy_ptr[0:num_pos*6]) \ 503 map(always, from: results_scratch_ptr[0:sposet_padded_size*num_pos*5])")
504 for (
int iw = 0; iw < num_pos; iw++)
505 for (
int team_id = 0; team_id < NumTeams; team_id++)
507 const size_t first = ChunkSizePerTeam * team_id;
508 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
510 auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw *
SoAFields3D::NUM_FIELDS;
511 auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
514 ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
515 spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
516 pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
518 const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
519 PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
520 PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
521 const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
522 GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
524 PRAGMA_OFFLOAD(
"omp parallel for")
525 for (
int index = 0; index < last - first; index++)
527 spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
528 d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
529 const int output_index = first + index;
539 const size_t first_cplx = first / 2;
541 PRAGMA_OFFLOAD(
"omp parallel for")
542 for (
int index = first_cplx; index < last_cplx; index++)
543 C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
544 sposet_padded_size, mKK_ptr, offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr,
545 myKcart_padded_size, first_spo_local, index);
549 for (
int iw = 0; iw < num_pos; ++iw)
551 auto* restrict results_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
555 for (
size_t i = 0; i < orb_size; i++)
557 psi_v[i] = results_iw_ptr[i];
558 dpsi_v[i][0] = results_iw_ptr[i + sposet_padded_size];
559 dpsi_v[i][1] = results_iw_ptr[i + sposet_padded_size * 2];
560 dpsi_v[i][2] = results_iw_ptr[i + sposet_padded_size * 3];
561 d2psi_v[i] = results_iw_ptr[i + sposet_padded_size * 4];
566 template<
typename ST>
577 auto& mw_pos_copy = mw_mem.mw_pos_copy;
578 auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
579 auto& mw_results_scratch = mw_mem.mw_results_scratch;
580 const int nwalkers = sa_list.size();
581 mw_pos_copy.resize(nwalkers * 6);
584 for (
int iw = 0; iw < nwalkers; ++iw)
586 const PointType& r = P_list[iw].activeR(iat);
587 PointType ru(PrimLattice.toUnit_floor(r));
588 mw_pos_copy[iw * 6] = r[0];
589 mw_pos_copy[iw * 6 + 1] = r[1];
590 mw_pos_copy[iw * 6 + 2] = r[2];
591 mw_pos_copy[iw * 6 + 3] = ru[0];
592 mw_pos_copy[iw * 6 + 4] = ru[1];
593 mw_pos_copy[iw * 6 + 5] = ru[2];
596 phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch, mw_results_scratch, psi_v_list, dpsi_v_list,
600 template<
typename ST>
604 const std::vector<const ValueType*>& invRow_ptr_list,
606 std::vector<ValueType>& ratios,
607 std::vector<GradType>& grads)
const 612 auto& buffer_H2D = mw_mem.buffer_H2D;
613 auto& rg_private = mw_mem.rg_private;
614 auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
615 auto& mw_results_scratch = mw_mem.mw_results_scratch;
616 const int nwalkers = spo_list.size();
617 buffer_H2D.resize(nwalkers,
sizeof(ST) * 6 +
sizeof(
ValueType*));
620 for (
int iw = 0; iw < nwalkers; ++iw)
622 const PointType& r = P_list[iw].activeR(iat);
623 PointType ru(PrimLattice.toUnit_floor(r));
624 Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
633 auto& invRow_ptr = *
reinterpret_cast<const ValueType**
>(buffer_H2D[iw] +
sizeof(ST) * 6);
634 invRow_ptr = invRow_ptr_list[iw];
637 const size_t num_pos = nwalkers;
638 const auto orb_size = phi_vgl_v.
size(2);
639 const auto spline_padded_size = myV.size();
640 const auto sposet_padded_size = getAlignedSize<ValueType>(OrbitalSetSize);
641 const size_t ChunkSizePerTeam = 512;
642 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
645 mw_results_scratch.resize(sposet_padded_size * num_pos * 5);
647 rg_private.resize(num_pos, NumTeams * 4);
650 const auto* spline_ptr = SplineInst->getSplinePtr();
651 auto* buffer_H2D_ptr = buffer_H2D.data();
652 auto* offload_scratch_ptr = mw_offload_scratch.data();
653 auto* results_scratch_ptr = mw_results_scratch.data();
654 const auto myKcart_padded_size = myKcart->capacity();
655 auto* mKK_ptr = mKK->data();
656 auto* GGt_ptr = GGt_offload->data();
657 auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
658 auto* myKcart_ptr = myKcart->data();
659 auto* phi_vgl_ptr = phi_vgl_v.
data();
660 auto* rg_private_ptr = rg_private.data();
661 const size_t buffer_H2D_stride = buffer_H2D.cols();
662 const size_t first_spo_local = first_spo;
663 const size_t phi_vgl_stride = num_pos * orb_size;
667 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \ 668 map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \ 669 map(always, from: rg_private_ptr[0:rg_private.size()])")
670 for (
int iw = 0; iw < num_pos; iw++)
671 for (
int team_id = 0; team_id < NumTeams; team_id++)
673 const size_t first = ChunkSizePerTeam * team_id;
674 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
676 auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw *
SoAFields3D::NUM_FIELDS;
677 auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
678 const auto* restrict pos_iw_ptr =
reinterpret_cast<ST*
>(buffer_H2D_ptr + buffer_H2D_stride * iw);
679 const auto* restrict invRow_iw_ptr =
680 *
reinterpret_cast<ValueType**
>(buffer_H2D_ptr + buffer_H2D_stride * iw +
sizeof(ST) * 6);
683 ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
684 spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
685 c, da, db, dc, d2a, d2b, d2c);
687 const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
688 PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
689 PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
690 const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
691 GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
693 PRAGMA_OFFLOAD(
"omp parallel for")
694 for (
int index = 0; index < last - first; index++)
696 spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
697 d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
698 const int output_index = first + index;
708 const size_t first_cplx = first / 2;
710 PRAGMA_OFFLOAD(
"omp parallel for")
711 for (
int index = first_cplx; index < last_cplx; index++)
712 C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, sposet_padded_size, mKK_ptr,
713 offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr, myKcart_padded_size,
714 first_spo_local, index);
717 ValueType* restrict dpsi_x = psi_iw_ptr + sposet_padded_size;
718 ValueType* restrict dpsi_y = psi_iw_ptr + sposet_padded_size * 2;
719 ValueType* restrict dpsi_z = psi_iw_ptr + sposet_padded_size * 3;
720 ValueType* restrict d2psi = psi_iw_ptr + sposet_padded_size * 4;
722 ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size;
723 ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
724 ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
725 ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
726 ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride;
728 ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
729 PRAGMA_OFFLOAD(
"omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
730 for (
int j = first_cplx; j < last_cplx; j++)
732 const size_t psiIndex = first_spo_local + j;
734 out_phi[psiIndex] = psi[psiIndex];
735 out_dphi_x[psiIndex] = dpsi_x[psiIndex];
736 out_dphi_y[psiIndex] = dpsi_y[psiIndex];
737 out_dphi_z[psiIndex] = dpsi_z[psiIndex];
738 out_d2phi[psiIndex] = d2psi[psiIndex];
740 ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
741 grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex];
742 grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex];
743 grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex];
746 rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio;
747 rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
748 rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
749 rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
753 for (
int iw = 0; iw < num_pos; iw++)
756 for (
int team_id = 0; team_id < NumTeams; team_id++)
757 ratio += rg_private[iw][team_id * 4];
760 ValueType grad_x(0), grad_y(0), grad_z(0);
761 for (
int team_id = 0; team_id < NumTeams; team_id++)
763 grad_x += rg_private[iw][team_id * 4 + 1];
764 grad_y += rg_private[iw][team_id * 4 + 2];
765 grad_z += rg_private[iw][team_id * 4 + 3];
767 grads[iw] =
GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
770 template<
typename ST>
779 const size_t last_cplx =
std::min(kPoints.size(), psi.size());
780 last = last > last_cplx ? last_cplx : last;
782 const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
783 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
784 g22 = PrimLattice.G(8);
785 const ST x = r[0], y = r[1], z = r[2];
787 const ST* restrict k0 = myKcart->
data(0);
788 const ST* restrict k1 = myKcart->data(1);
789 const ST* restrict k2 = myKcart->data(2);
791 const ST* restrict g0 = myG.data(0);
792 const ST* restrict g1 = myG.data(1);
793 const ST* restrict g2 = myG.data(2);
794 const ST* restrict h00 = myH.data(0);
795 const ST* restrict h01 = myH.data(1);
796 const ST* restrict h02 = myH.data(2);
797 const ST* restrict h11 = myH.data(3);
798 const ST* restrict h12 = myH.data(4);
799 const ST* restrict h22 = myH.data(5);
802 for (
size_t j = first; j < last; ++j)
810 const ST val_r = myV[jr];
811 const ST val_i = myV[ji];
818 const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
819 const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
820 const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
822 const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
823 const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
824 const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
827 const ST gX_r = dX_r + val_i * kX;
828 const ST gY_r = dY_r + val_i * kY;
829 const ST gZ_r = dZ_r + val_i * kZ;
830 const ST gX_i = dX_i - val_r * kX;
831 const ST gY_i = dY_i - val_r * kY;
832 const ST gZ_i = dZ_i - val_r * kZ;
834 const size_t psiIndex = j + first_spo;
835 psi[psiIndex] =
ComplexT(c * val_r -
s * val_i, c * val_i +
s * val_r);
836 dpsi[psiIndex][0] =
ComplexT(c * gX_r -
s * gX_i, c * gX_i +
s * gX_r);
837 dpsi[psiIndex][1] =
ComplexT(c * gY_r -
s * gY_i, c * gY_i +
s * gY_r);
838 dpsi[psiIndex][2] =
ComplexT(c * gZ_r -
s * gZ_i, c * gZ_i +
s * gZ_r);
841 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
843 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
845 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
847 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
849 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
851 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
853 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
855 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
857 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
860 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
862 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
864 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
866 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
868 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
870 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
872 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
874 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
876 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
878 grad_grad_psi[psiIndex][0] =
ComplexT(c * h_xx_r -
s * h_xx_i, c * h_xx_i +
s * h_xx_r);
879 grad_grad_psi[psiIndex][1] =
ComplexT(c * h_xy_r -
s * h_xy_i, c * h_xy_i +
s * h_xy_r);
880 grad_grad_psi[psiIndex][2] =
ComplexT(c * h_xz_r -
s * h_xz_i, c * h_xz_i +
s * h_xz_r);
881 grad_grad_psi[psiIndex][3] =
ComplexT(c * h_yx_r -
s * h_yx_i, c * h_yx_i +
s * h_yx_r);
882 grad_grad_psi[psiIndex][4] =
ComplexT(c * h_yy_r -
s * h_yy_i, c * h_yy_i +
s * h_yy_r);
883 grad_grad_psi[psiIndex][5] =
ComplexT(c * h_yz_r -
s * h_yz_i, c * h_yz_i +
s * h_yz_r);
884 grad_grad_psi[psiIndex][6] =
ComplexT(c * h_zx_r -
s * h_zx_i, c * h_zx_i +
s * h_zx_r);
885 grad_grad_psi[psiIndex][7] =
ComplexT(c * h_zy_r -
s * h_zy_i, c * h_zy_i +
s * h_zy_r);
886 grad_grad_psi[psiIndex][8] =
ComplexT(c * h_zz_r -
s * h_zz_i, c * h_zz_i +
s * h_zz_r);
890 template<
typename ST>
898 PointType ru(PrimLattice.toUnit_floor(r));
906 spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
907 assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
911 template<
typename ST>
921 const size_t last_cplx =
std::min(kPoints.size(), psi.size());
922 last = last < 0 ? last_cplx : (last > last_cplx ? last_cplx : last);
924 const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
925 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
926 g22 = PrimLattice.G(8);
927 const ST x = r[0], y = r[1], z = r[2];
929 const ST* restrict k0 = myKcart->
data(0);
930 const ST* restrict k1 = myKcart->data(1);
931 const ST* restrict k2 = myKcart->data(2);
933 const ST* restrict g0 = myG.data(0);
934 const ST* restrict g1 = myG.data(1);
935 const ST* restrict g2 = myG.data(2);
936 const ST* restrict h00 = myH.data(0);
937 const ST* restrict h01 = myH.data(1);
938 const ST* restrict h02 = myH.data(2);
939 const ST* restrict h11 = myH.data(3);
940 const ST* restrict h12 = myH.data(4);
941 const ST* restrict h22 = myH.data(5);
943 const ST* restrict gh000 = mygH.data(0);
944 const ST* restrict gh001 = mygH.data(1);
945 const ST* restrict gh002 = mygH.data(2);
946 const ST* restrict gh011 = mygH.data(3);
947 const ST* restrict gh012 = mygH.data(4);
948 const ST* restrict gh022 = mygH.data(5);
949 const ST* restrict gh111 = mygH.data(6);
950 const ST* restrict gh112 = mygH.data(7);
951 const ST* restrict gh122 = mygH.data(8);
952 const ST* restrict gh222 = mygH.data(9);
956 for (
size_t j = first; j < last; ++j)
964 const ST val_r = myV[jr];
965 const ST val_i = myV[ji];
972 const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
973 const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
974 const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
976 const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
977 const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
978 const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
981 const ST gX_r = dX_r + val_i * kX;
982 const ST gY_r = dY_r + val_i * kY;
983 const ST gZ_r = dZ_r + val_i * kZ;
984 const ST gX_i = dX_i - val_r * kX;
985 const ST gY_i = dY_i - val_r * kY;
986 const ST gZ_i = dZ_i - val_r * kZ;
988 const size_t psiIndex = j + first_spo;
989 psi[psiIndex] =
ComplexT(c * val_r -
s * val_i, c * val_i +
s * val_r);
990 dpsi[psiIndex][0] =
ComplexT(c * gX_r -
s * gX_i, c * gX_i +
s * gX_r);
991 dpsi[psiIndex][1] =
ComplexT(c * gY_r -
s * gY_i, c * gY_i +
s * gY_r);
992 dpsi[psiIndex][2] =
ComplexT(c * gZ_r -
s * gZ_i, c * gZ_i +
s * gZ_r);
995 const ST f_xx_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
996 const ST f_xy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
997 const ST f_xz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
998 const ST f_yy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
999 const ST f_yz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1000 const ST f_zz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1002 const ST f_xx_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1003 const ST f_xy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1004 const ST f_xz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1005 const ST f_yy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1006 const ST f_yz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1007 const ST f_zz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1009 const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1010 const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1011 const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1012 const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1013 const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1014 const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1016 const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1017 const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1018 const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1019 const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1020 const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1021 const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1023 grad_grad_psi[psiIndex][0] =
ComplexT(c * h_xx_r -
s * h_xx_i, c * h_xx_i +
s * h_xx_r);
1024 grad_grad_psi[psiIndex][1] =
ComplexT(c * h_xy_r -
s * h_xy_i, c * h_xy_i +
s * h_xy_r);
1025 grad_grad_psi[psiIndex][2] =
ComplexT(c * h_xz_r -
s * h_xz_i, c * h_xz_i +
s * h_xz_r);
1026 grad_grad_psi[psiIndex][3] =
ComplexT(c * h_xy_r -
s * h_xy_i, c * h_xy_i +
s * h_xy_r);
1027 grad_grad_psi[psiIndex][4] =
ComplexT(c * h_yy_r -
s * h_yy_i, c * h_yy_i +
s * h_yy_r);
1028 grad_grad_psi[psiIndex][5] =
ComplexT(c * h_yz_r -
s * h_yz_i, c * h_yz_i +
s * h_yz_r);
1029 grad_grad_psi[psiIndex][6] =
ComplexT(c * h_xz_r -
s * h_xz_i, c * h_xz_i +
s * h_xz_r);
1030 grad_grad_psi[psiIndex][7] =
ComplexT(c * h_yz_r -
s * h_yz_i, c * h_yz_i +
s * h_yz_r);
1031 grad_grad_psi[psiIndex][8] =
ComplexT(c * h_zz_r -
s * h_zz_i, c * h_zz_i +
s * h_zz_r);
1036 const ST f3_xxx_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1037 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1038 const ST f3_xxy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1039 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1040 const ST f3_xxz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1041 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1042 const ST f3_xyy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1043 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1044 const ST f3_xyz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1045 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1046 const ST f3_xzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1047 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1048 const ST f3_yyy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1049 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1050 const ST f3_yyz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1051 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1052 const ST f3_yzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1053 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1054 const ST f3_zzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1055 gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1057 const ST f3_xxx_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1058 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1059 const ST f3_xxy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1060 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1061 const ST f3_xxz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1062 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1063 const ST f3_xyy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1064 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1065 const ST f3_xyz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1066 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1067 const ST f3_xzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1068 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1069 const ST f3_yyy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1070 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1071 const ST f3_yyz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1072 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1073 const ST f3_yzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1074 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1075 const ST f3_zzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1076 gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1079 const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1080 const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1082 f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1084 f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1086 f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1088 f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1090 f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1092 f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1093 const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1094 (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1095 const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1096 (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1098 f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1100 f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1101 const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1102 const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1104 f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1106 f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1108 f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1110 f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1111 const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1112 const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1114 grad_grad_grad_psi[psiIndex][0][0] =
ComplexT(c * gh_xxx_r -
s * gh_xxx_i, c * gh_xxx_i +
s * gh_xxx_r);
1115 grad_grad_grad_psi[psiIndex][0][1] =
ComplexT(c * gh_xxy_r -
s * gh_xxy_i, c * gh_xxy_i +
s * gh_xxy_r);
1116 grad_grad_grad_psi[psiIndex][0][2] =
ComplexT(c * gh_xxz_r -
s * gh_xxz_i, c * gh_xxz_i +
s * gh_xxz_r);
1117 grad_grad_grad_psi[psiIndex][0][3] =
ComplexT(c * gh_xxy_r -
s * gh_xxy_i, c * gh_xxy_i +
s * gh_xxy_r);
1118 grad_grad_grad_psi[psiIndex][0][4] =
ComplexT(c * gh_xyy_r -
s * gh_xyy_i, c * gh_xyy_i +
s * gh_xyy_r);
1119 grad_grad_grad_psi[psiIndex][0][5] =
ComplexT(c * gh_xyz_r -
s * gh_xyz_i, c * gh_xyz_i +
s * gh_xyz_r);
1120 grad_grad_grad_psi[psiIndex][0][6] =
ComplexT(c * gh_xxz_r -
s * gh_xxz_i, c * gh_xxz_i +
s * gh_xxz_r);
1121 grad_grad_grad_psi[psiIndex][0][7] =
ComplexT(c * gh_xyz_r -
s * gh_xyz_i, c * gh_xyz_i +
s * gh_xyz_r);
1122 grad_grad_grad_psi[psiIndex][0][8] =
ComplexT(c * gh_xzz_r -
s * gh_xzz_i, c * gh_xzz_i +
s * gh_xzz_r);
1124 grad_grad_grad_psi[psiIndex][1][0] =
ComplexT(c * gh_xxy_r -
s * gh_xxy_i, c * gh_xxy_i +
s * gh_xxy_r);
1125 grad_grad_grad_psi[psiIndex][1][1] =
ComplexT(c * gh_xyy_r -
s * gh_xyy_i, c * gh_xyy_i +
s * gh_xyy_r);
1126 grad_grad_grad_psi[psiIndex][1][2] =
ComplexT(c * gh_xyz_r -
s * gh_xyz_i, c * gh_xyz_i +
s * gh_xyz_r);
1127 grad_grad_grad_psi[psiIndex][1][3] =
ComplexT(c * gh_xyy_r -
s * gh_xyy_i, c * gh_xyy_i +
s * gh_xyy_r);
1128 grad_grad_grad_psi[psiIndex][1][4] =
ComplexT(c * gh_yyy_r -
s * gh_yyy_i, c * gh_yyy_i +
s * gh_yyy_r);
1129 grad_grad_grad_psi[psiIndex][1][5] =
ComplexT(c * gh_yyz_r -
s * gh_yyz_i, c * gh_yyz_i +
s * gh_yyz_r);
1130 grad_grad_grad_psi[psiIndex][1][6] =
ComplexT(c * gh_xyz_r -
s * gh_xyz_i, c * gh_xyz_i +
s * gh_xyz_r);
1131 grad_grad_grad_psi[psiIndex][1][7] =
ComplexT(c * gh_yyz_r -
s * gh_yyz_i, c * gh_yyz_i +
s * gh_yyz_r);
1132 grad_grad_grad_psi[psiIndex][1][8] =
ComplexT(c * gh_yzz_r -
s * gh_yzz_i, c * gh_yzz_i +
s * gh_yzz_r);
1135 grad_grad_grad_psi[psiIndex][2][0] =
ComplexT(c * gh_xxz_r -
s * gh_xxz_i, c * gh_xxz_i +
s * gh_xxz_r);
1136 grad_grad_grad_psi[psiIndex][2][1] =
ComplexT(c * gh_xyz_r -
s * gh_xyz_i, c * gh_xyz_i +
s * gh_xyz_r);
1137 grad_grad_grad_psi[psiIndex][2][2] =
ComplexT(c * gh_xzz_r -
s * gh_xzz_i, c * gh_xzz_i +
s * gh_xzz_r);
1138 grad_grad_grad_psi[psiIndex][2][3] =
ComplexT(c * gh_xyz_r -
s * gh_xyz_i, c * gh_xyz_i +
s * gh_xyz_r);
1139 grad_grad_grad_psi[psiIndex][2][4] =
ComplexT(c * gh_yyz_r -
s * gh_yyz_i, c * gh_yyz_i +
s * gh_yyz_r);
1140 grad_grad_grad_psi[psiIndex][2][5] =
ComplexT(c * gh_yzz_r -
s * gh_yzz_i, c * gh_yzz_i +
s * gh_yzz_r);
1141 grad_grad_grad_psi[psiIndex][2][6] =
ComplexT(c * gh_xzz_r -
s * gh_xzz_i, c * gh_xzz_i +
s * gh_xzz_r);
1142 grad_grad_grad_psi[psiIndex][2][7] =
ComplexT(c * gh_yzz_r -
s * gh_yzz_i, c * gh_yzz_i +
s * gh_yzz_r);
1143 grad_grad_grad_psi[psiIndex][2][8] =
ComplexT(c * gh_zzz_r -
s * gh_zzz_i, c * gh_zzz_i +
s * gh_zzz_r);
1147 template<
typename ST>
1156 PointType ru(PrimLattice.toUnit_floor(r));
1157 #pragma omp parallel 1162 spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
1163 assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
1167 template<
typename ST>
1176 const int block_size = 16;
1179 std::vector<ValueVector> multi_psi_v;
1180 std::vector<GradVector> multi_dpsi_v;
1181 std::vector<ValueVector> multi_d2psi_v;
1186 multi_psi_v.reserve(block_size);
1187 multi_dpsi_v.reserve(block_size);
1188 multi_d2psi_v.reserve(block_size);
1189 psi_v_list.reserve(block_size);
1190 dpsi_v_list.reserve(block_size);
1191 d2psi_v_list.reserve(block_size);
1193 for (
int iat = first, i = 0; iat < last; iat += block_size, i += block_size)
1195 const int actual_block_size =
std::min(last - iat, block_size);
1196 multi_pos_copy.resize(actual_block_size * 6);
1197 multi_psi_v.clear();
1198 multi_dpsi_v.clear();
1199 multi_d2psi_v.clear();
1201 dpsi_v_list.clear();
1202 d2psi_v_list.clear();
1204 for (
int ipos = 0; ipos < actual_block_size; ++ipos)
1208 PointType ru(PrimLattice.toUnit_floor(r));
1209 multi_pos_copy[ipos * 6] = r[0];
1210 multi_pos_copy[ipos * 6 + 1] = r[1];
1211 multi_pos_copy[ipos * 6 + 2] = r[2];
1212 multi_pos_copy[ipos * 6 + 3] = ru[0];
1213 multi_pos_copy[ipos * 6 + 4] = ru[1];
1214 multi_pos_copy[ipos * 6 + 5] = ru[2];
1216 multi_psi_v.emplace_back(logdet[i + ipos], logdet.cols());
1217 multi_dpsi_v.emplace_back(dlogdet[i + ipos], dlogdet.cols());
1218 multi_d2psi_v.emplace_back(d2logdet[i + ipos], d2logdet.cols());
1220 psi_v_list.push_back(multi_psi_v[ipos]);
1221 dpsi_v_list.push_back(multi_dpsi_v[ipos]);
1222 d2psi_v_list.push_back(multi_d2psi_v[ipos]);
1225 evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
OrbitalSetTraits< ValueType >::HessVector HessVector
T SymTrace(T h00, T h01, T h02, T h11, T h12, T h22, const T gg[6])
compute Trace(H*G)
virtual void evaluateVGL(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi) override
evaluate the values, gradients and laplacians of this single-particle orbital set ...
helper functions for EinsplineSetBuilder
virtual void evaluateValue(const ParticleSet &P, const int iat, ValueVector &psi) override
evaluate the values of this single-particle orbital set
QTBase::RealType RealType
ResourceHandle< SplineOMPTargetMultiWalkerMem< ST, ComplexT > > mw_mem_handle_
A ParticleSet that handles virtual moves of a selected particle of a given physical ParticleSet Virtu...
OrbitalSetTraits< ValueType >::ValueMatrix ValueMatrix
size_t getTotalNum() const
T v_m_v(T h00, T h01, T h02, T h11, T h12, T h22, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z)
compute vector[3]^T x matrix[3][3] x vector[3]
T t3_contract(T h000, T h001, T h002, T h011, T h012, T h022, T h111, T h112, T h122, T h222, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z, T g3x, T g3y, T g3z)
Coordinate transform for a 3rd rank symmetric tensor representing coordinate derivatives (hence t3_co...
bool write_splines(hdf_archive &h5f)
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
virtual void evaluateVGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi) override
evaluate the values, gradients and hessians of this single-particle orbital set
class to match std::complex<ST> spline with BsplineSet::ValueType (complex) SPOs with OpenMP offload ...
virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const std::vector< const ValueType *> &invRow_ptr_list, OffloadMWVGLArray &phi_vgl_v, std::vector< ValueType > &ratios, std::vector< GradType > &grads) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ra...
void evaluateVGLMultiPos(const Vector< ST, OffloadPinnedAllocator< ST >> &multi_pos_copy, Vector< ST, OffloadPinnedAllocator< ST >> &offload_scratch, Vector< ComplexT, OffloadPinnedAllocator< ComplexT >> &results_scratch, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const
OrbitalSetTraits< ValueType >::GradMatrix GradMatrix
omp_int_t omp_get_thread_num()
Specialized paritlce class for atomistic simulations.
SplineC2COMPTarget(const std::string &my_name)
QTBase::ValueType ValueType
void FairDivideAligned(const int ntot, const int base, const int npart, const int me, int &first, int &last)
Partition ntot over npart and the size of each partition is a multiple of base size.
OrbitalSetTraits< ValueType >::ValueVector ValueVector
virtual void mw_evaluateDetRatios(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< const VirtualParticleSet > &vp_list, const RefVector< ValueVector > &psi_list, const std::vector< const ValueType *> &invRow_ptr_list, std::vector< std::vector< ValueType >> &ratios_list) const override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP, of multiple walkers ...
void assign_vgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, int first, int last) const
void assign_vghgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi, int first=0, int last=-1) const
const PosType & activeR(int iat) const
return the active position if the particle is active or the return current position if not ...
omp_int_t omp_get_num_threads()
OMPallocator is an allocator with fused device and dualspace allocator functionality.
typename BsplineSet::ValueType ComplexT
void assign_v(const PointType &r, const vContainer_type &myV, ValueVector &psi, int first, int last) const
virtual void evaluate_notranspose(const ParticleSet &P, int first, int last, ValueMatrix &logdet, GradMatrix &dlogdet, ValueMatrix &d2logdet) override
evaluate the values, gradients and laplacians of this single-particle orbital for [first...
std::vector< std::reference_wrapper< T > > RefVector
virtual void mw_evaluateVGL(const RefVectorWithLeader< SPOSet > &sa_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets of multiple walker...
bool read_splines(hdf_archive &h5f)
CASTTYPE & getCastedLeader() const
OrbitalSetTraits< ValueType >::GradHessVector GGGVector
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
OrbitalSetTraits< ValueType >::GradVector GradVector
void assign_vgl(ST x, ST y, ST z, TT *restrict results_scratch_ptr, size_t orb_padded_size, const ST *mKK_ptr, const ST *restrict offload_scratch_ptr, size_t spline_padded_size, const ST G[9], const ST *myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
assign_vgl
void assign_vgl_from_l(const PointType &r, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi)
assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian ...
virtual void evaluateDetRatios(const VirtualParticleSet &VP, ValueVector &psi, const ValueVector &psiinv, std::vector< ValueType > &ratios) override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP
class to handle complex splines to complex orbitals with splines of arbitrary precision splines stora...
void sincos(T a, T *restrict s, T *restrict c)
sincos function wrapper
virtual void evaluateVGHGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi) override
evaluate the values, gradients, hessians, and grad hessians of this single-particle orbital set ...
bool readEntry(T &data, const std::string &aname)
read the data from the group aname and return status use read() for inbuilt error checking ...
A D-dimensional Array class based on PETE.
void set_spline(SingleSplineType *spline_r, SingleSplineType *spline_i, int twist, int ispline, int level)
bool writeEntry(T &data, const std::string &aname)
write the data to the group aname and return status use write() for inbuilt error checking ...
UBspline_3d_d SingleSplineType