14 #include "spline2/MultiBsplineEval.hpp" 15 #include "spline2/MultiBsplineEval_OMPoffload.hpp" 32 SplineInst->copy_spline(spline_r, 2 * ispline);
33 SplineInst->copy_spline(spline_i, 2 * ispline + 1);
40 o <<
"spline_" << MyIndex;
41 einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
42 return h5f.
readEntry(bigtable, o.str().c_str());
49 o <<
"spline_" << MyIndex;
50 einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
51 return h5f.
writeEntry(bigtable, o.str().c_str());
62 last = last > kPoints.size() ? kPoints.size() : last;
64 const ST x = r[0], y = r[1], z = r[2];
65 const ST* restrict kx = myKcart->
data(0);
66 const ST* restrict ky = myKcart->data(1);
67 const ST* restrict kz = myKcart->data(2);
69 TT* restrict psi_s = psi.data() + first_spo;
70 const size_t requested_orb_size = psi.size();
72 for (
size_t j = first; j <
std::min(nComplexBands, last); j++)
75 const size_t jr = j << 1;
76 const size_t ji = jr + 1;
77 const ST val_r = myV[jr];
78 const ST val_i = myV[ji];
80 if (jr < requested_orb_size)
81 psi_s[jr] = val_r * c - val_i *
s;
82 if (ji < requested_orb_size)
83 psi_s[ji] = val_i * c + val_r *
s;
86 psi_s += nComplexBands;
88 for (
size_t j = std::max(nComplexBands, first); j < last; j++)
91 const ST val_r = myV[2 * j];
92 const ST val_i = myV[2 * j + 1];
94 if (j + nComplexBands < requested_orb_size)
95 psi_s[j] = val_r * c - val_i *
s;
103 PointType ru(PrimLattice.toUnit_floor(r));
112 spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
113 assign_v(r, myV, psi, first / 2, last / 2);
118 const size_t ChunkSizePerTeam = 512;
119 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
121 const auto spline_padded_size = myV.size();
122 const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
123 offload_scratch.resize(spline_padded_size);
124 results_scratch.resize(sposet_padded_size);
127 const auto* spline_ptr = SplineInst->getSplinePtr();
128 auto* offload_scratch_ptr = offload_scratch.data();
129 auto* results_scratch_ptr = results_scratch.data();
130 auto* psi_ptr = psi.data();
131 const auto x = r[0], y = r[1], z = r[2];
132 const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
133 const auto myKcart_padded_size = myKcart->capacity();
134 auto* myKcart_ptr = myKcart->
data();
135 const size_t first_spo_local = first_spo;
136 const size_t nComplexBands_local = nComplexBands;
137 const auto requested_orb_size = psi.size();
138 const auto num_complex_splines = kPoints.size();
142 PRAGMA_OFFLOAD(
"omp target teams distribute num_teams(NumTeams) \ 143 map(always, from: results_scratch_ptr[0:sposet_padded_size])")
144 for (
int team_id = 0; team_id < NumTeams; team_id++)
146 const size_t first = ChunkSizePerTeam * team_id;
147 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
151 spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c);
153 PRAGMA_OFFLOAD(
"omp parallel for")
154 for (
int index = 0; index < last - first; index++)
155 spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
156 offload_scratch_ptr + first + index);
157 const size_t first_cplx = first / 2;
158 const size_t last_cplx =
omptarget::min(last / 2, num_complex_splines);
159 PRAGMA_OFFLOAD(
"omp parallel for")
160 for (
int index = first_cplx; index < last_cplx; index++)
161 C2R::assign_v(x, y, z, results_scratch_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size,
162 first_spo_local, nComplexBands_local, index);
165 for (
size_t i = 0; i < requested_orb_size; i++)
166 psi[i] = results_scratch[i];
171 template<
typename ST>
175 std::vector<ValueType>& ratios)
178 psiinv_pos_copy.resize(psiinv.size() + nVP * 6);
181 std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
184 auto* restrict pos_scratch = psiinv_pos_copy.data() + psiinv.size();
185 for (
int iat = 0; iat < nVP; ++iat)
188 PointType ru(PrimLattice.toUnit_floor(r));
189 pos_scratch[iat * 6] = r[0];
190 pos_scratch[iat * 6 + 1] = r[1];
191 pos_scratch[iat * 6 + 2] = r[2];
192 pos_scratch[iat * 6 + 3] = ru[0];
193 pos_scratch[iat * 6 + 4] = ru[1];
194 pos_scratch[iat * 6 + 5] = ru[2];
197 const size_t ChunkSizePerTeam = 512;
198 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
199 ratios_private.resize(nVP, NumTeams);
200 const auto spline_padded_size = myV.
size();
201 const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
202 offload_scratch.resize(spline_padded_size * nVP);
203 results_scratch.resize(sposet_padded_size * nVP);
206 const auto* spline_ptr = SplineInst->getSplinePtr();
207 auto* offload_scratch_ptr = offload_scratch.data();
208 auto* results_scratch_ptr = results_scratch.data();
209 const auto myKcart_padded_size = myKcart->capacity();
210 auto* myKcart_ptr = myKcart->data();
211 auto* psiinv_ptr = psiinv_pos_copy.data();
212 auto* ratios_private_ptr = ratios_private.data();
213 const size_t first_spo_local = first_spo;
214 const size_t nComplexBands_local = nComplexBands;
215 const auto requested_orb_size = psiinv.size();
216 const auto num_complex_splines = kPoints.size();
220 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \ 221 map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \ 222 map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
223 for (
int iat = 0; iat < nVP; iat++)
224 for (
int team_id = 0; team_id < NumTeams; team_id++)
226 const size_t first = ChunkSizePerTeam * team_id;
227 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
229 auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
230 auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
231 auto* restrict pos_scratch = psiinv_ptr + requested_orb_size;
235 spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
236 ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
238 PRAGMA_OFFLOAD(
"omp parallel for")
239 for (
int index = 0; index < last - first; index++)
240 spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
241 offload_scratch_iat_ptr + first + index);
242 const size_t first_cplx = first / 2;
243 const size_t last_cplx =
omptarget::min(last / 2, num_complex_splines);
244 PRAGMA_OFFLOAD(
"omp parallel for")
245 for (
int index = first_cplx; index < last_cplx; index++)
246 C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
247 psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
248 nComplexBands_local, index);
250 const size_t first_real = first_cplx +
omptarget::min(nComplexBands_local, first_cplx);
251 const size_t last_real =
254 PRAGMA_OFFLOAD(
"omp parallel for simd reduction(+:sum)")
255 for (
int i = first_real; i < last_real; i++)
256 sum += psi_iat_ptr[i] * psiinv_ptr[i];
257 ratios_private_ptr[iat * NumTeams + team_id] = sum;
262 for (
int iat = 0; iat < nVP; ++iat)
265 for (
int tid = 0; tid < NumTeams; tid++)
266 ratios[iat] += ratios_private[iat][tid];
270 template<
typename ST>
274 const std::vector<const ValueType*>& invRow_ptr_list,
275 std::vector<std::vector<ValueType>>& ratios_list)
const 280 auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
281 auto& mw_ratios_private = mw_mem.mw_ratios_private;
282 auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
283 auto& mw_results_scratch = mw_mem.mw_results_scratch;
284 const size_t nw = spo_list.size();
285 const size_t requested_orb_size = phi_leader.size();
289 mw_nVP += VP.getTotalNum();
291 const size_t packed_size = nw *
sizeof(
ValueType*) + mw_nVP * (6 *
sizeof(
TT) +
sizeof(int));
292 det_ratios_buffer_H2D.resize(packed_size);
296 for (
size_t iw = 0; iw < nw; iw++)
297 ptr_buffer[iw] = invRow_ptr_list[iw];
300 auto* pos_ptr =
reinterpret_cast<TT*
>(det_ratios_buffer_H2D.data() + nw *
sizeof(
ValueType*));
302 reinterpret_cast<int*
>(det_ratios_buffer_H2D.data() + nw *
sizeof(
ValueType*) + mw_nVP * 6 *
sizeof(
TT));
304 for (
size_t iw = 0; iw < nw; iw++)
307 assert(ratios_list[iw].size() == VP.
getTotalNum());
308 for (
size_t iat = 0; iat < VP.
getTotalNum(); ++iat, ++iVP)
310 ref_id_ptr[iVP] = iw;
312 PointType ru(PrimLattice.toUnit_floor(r));
323 const size_t ChunkSizePerTeam = 512;
324 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
325 mw_ratios_private.resize(mw_nVP, NumTeams);
326 const auto spline_padded_size = myV.
size();
327 const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
328 mw_offload_scratch.resize(spline_padded_size * mw_nVP);
329 mw_results_scratch.resize(sposet_padded_size * mw_nVP);
332 const auto* spline_ptr = SplineInst->getSplinePtr();
333 auto* offload_scratch_ptr = mw_offload_scratch.data();
334 auto* results_scratch_ptr = mw_results_scratch.data();
335 const auto myKcart_padded_size = myKcart->capacity();
336 auto* myKcart_ptr = myKcart->data();
337 auto* buffer_H2D_ptr = det_ratios_buffer_H2D.data();
338 auto* ratios_private_ptr = mw_ratios_private.data();
339 const size_t first_spo_local = first_spo;
340 const size_t nComplexBands_local = nComplexBands;
341 const auto num_complex_splines = kPoints.size();
345 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \ 346 map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \ 347 map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
348 for (
int iat = 0; iat < mw_nVP; iat++)
349 for (
int team_id = 0; team_id < NumTeams; team_id++)
351 const size_t first = ChunkSizePerTeam * team_id;
352 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
354 auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + spline_padded_size * iat;
355 auto* restrict psi_iat_ptr = results_scratch_ptr + sposet_padded_size * iat;
356 auto* ref_id_ptr =
reinterpret_cast<int*
>(buffer_H2D_ptr + nw *
sizeof(
ValueType*) + mw_nVP * 6 *
sizeof(
TT));
357 auto* restrict psiinv_ptr =
reinterpret_cast<const ValueType**
>(buffer_H2D_ptr)[ref_id_ptr[iat]];
358 auto* restrict pos_scratch =
reinterpret_cast<TT*
>(buffer_H2D_ptr + nw *
sizeof(
ValueType*));
362 spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
363 ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
365 PRAGMA_OFFLOAD(
"omp parallel for")
366 for (
int index = 0; index < last - first; index++)
367 spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
368 offload_scratch_iat_ptr + first + index);
369 const size_t first_cplx = first / 2;
370 const size_t last_cplx =
omptarget::min(last / 2, num_complex_splines);
371 PRAGMA_OFFLOAD(
"omp parallel for")
372 for (
int index = first_cplx; index < last_cplx; index++)
373 C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
374 psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
375 nComplexBands_local, index);
377 const size_t first_real = first_cplx +
omptarget::min(nComplexBands_local, first_cplx);
378 const size_t last_real =
381 PRAGMA_OFFLOAD(
"omp parallel for simd reduction(+:sum)")
382 for (
int i = first_real; i < last_real; i++)
383 sum += psi_iat_ptr[i] * psiinv_ptr[i];
384 ratios_private_ptr[iat * NumTeams + team_id] = sum;
390 for (
size_t iw = 0; iw < nw; iw++)
392 auto& ratios = ratios_list[iw];
393 for (
size_t iat = 0; iat < ratios.size(); iat++, iVP++)
396 for (
int tid = 0; tid < NumTeams; ++tid)
397 ratios[iat] += mw_ratios_private[iVP][tid];
404 template<
typename ST>
411 const ST x = r[0], y = r[1], z = r[2];
413 const ST* restrict k0 = myKcart->
data(0);
415 const ST* restrict k1 = myKcart->data(1);
417 const ST* restrict k2 = myKcart->data(2);
420 const ST* restrict g0 = myG.data(0);
422 const ST* restrict g1 = myG.data(1);
424 const ST* restrict g2 = myG.data(2);
427 const size_t N = kPoints.size();
429 const size_t requested_orb_size = psi.size();
431 for (
size_t j = 0; j < nComplexBands; j++)
433 const size_t jr = j << 1;
434 const size_t ji = jr + 1;
439 const ST val_r = myV[jr];
440 const ST val_i = myV[ji];
447 const ST dX_r = g0[jr];
448 const ST dY_r = g1[jr];
449 const ST dZ_r = g2[jr];
451 const ST dX_i = g0[ji];
452 const ST dY_i = g1[ji];
453 const ST dZ_i = g2[ji];
456 const ST gX_r = dX_r + val_i * kX;
457 const ST gY_r = dY_r + val_i * kY;
458 const ST gZ_r = dZ_r + val_i * kZ;
459 const ST gX_i = dX_i - val_r * kX;
460 const ST gY_i = dY_i - val_r * kY;
461 const ST gZ_i = dZ_i - val_r * kZ;
463 const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
464 const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
466 const size_t psiIndex = first_spo + jr;
467 if (psiIndex < requested_orb_size)
469 psi[psiIndex] = c * val_r -
s * val_i;
470 d2psi[psiIndex] = c * lap_r -
s * lap_i;
471 dpsi[psiIndex][0] = c * gX_r -
s * gX_i;
472 dpsi[psiIndex][1] = c * gY_r -
s * gY_i;
473 dpsi[psiIndex][2] = c * gZ_r -
s * gZ_i;
475 if (psiIndex + 1 < requested_orb_size)
477 psi[psiIndex + 1] = c * val_i +
s * val_r;
478 d2psi[psiIndex + 1] = c * lap_i +
s * lap_r;
479 dpsi[psiIndex + 1][0] = c * gX_i +
s * gX_r;
480 dpsi[psiIndex + 1][1] = c * gY_i +
s * gY_r;
481 dpsi[psiIndex + 1][2] = c * gZ_i +
s * gZ_r;
486 for (
size_t j = nComplexBands; j <
N; j++)
488 const size_t jr = j << 1;
489 const size_t ji = jr + 1;
494 const ST val_r = myV[jr];
495 const ST val_i = myV[ji];
502 const ST dX_r = g0[jr];
503 const ST dY_r = g1[jr];
504 const ST dZ_r = g2[jr];
506 const ST dX_i = g0[ji];
507 const ST dY_i = g1[ji];
508 const ST dZ_i = g2[ji];
511 const ST gX_r = dX_r + val_i * kX;
512 const ST gY_r = dY_r + val_i * kY;
513 const ST gZ_r = dZ_r + val_i * kZ;
514 const ST gX_i = dX_i - val_r * kX;
515 const ST gY_i = dY_i - val_r * kY;
516 const ST gZ_i = dZ_i - val_r * kZ;
517 if (
const size_t psiIndex = first_spo + nComplexBands + j; psiIndex < requested_orb_size)
519 psi[psiIndex] = c * val_r -
s * val_i;
520 dpsi[psiIndex][0] = c * gX_r -
s * gX_i;
521 dpsi[psiIndex][1] = c * gY_r -
s * gY_i;
522 dpsi[psiIndex][2] = c * gZ_r -
s * gZ_i;
524 const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
525 const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
526 d2psi[psiIndex] = c * lap_r -
s * lap_i;
531 template<
typename ST>
539 PointType ru(PrimLattice.toUnit_floor(r));
541 const size_t ChunkSizePerTeam = 512;
542 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
544 const auto spline_padded_size = myV.
size();
545 const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
549 results_scratch.resize(sposet_padded_size * 5);
552 const auto* spline_ptr = SplineInst->getSplinePtr();
553 auto* offload_scratch_ptr = offload_scratch.data();
554 auto* results_scratch_ptr = results_scratch.data();
555 const auto x = r[0], y = r[1], z = r[2];
556 const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
557 const auto myKcart_padded_size = myKcart->capacity();
558 auto* mKK_ptr = mKK->
data();
559 auto* GGt_ptr = GGt_offload->data();
560 auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
561 auto* myKcart_ptr = myKcart->data();
562 const size_t first_spo_local = first_spo;
563 const size_t nComplexBands_local = nComplexBands;
564 const auto requested_orb_size = psi.size();
565 const auto num_complex_splines = kPoints.size();
569 PRAGMA_OFFLOAD(
"omp target teams distribute num_teams(NumTeams) \ 570 map(always, from: results_scratch_ptr[0:sposet_padded_size*5])")
571 for (
int team_id = 0; team_id < NumTeams; team_id++)
573 const size_t first = ChunkSizePerTeam * team_id;
574 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
577 ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
578 spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
580 const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
581 PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
582 PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
583 const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
584 GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
586 PRAGMA_OFFLOAD(
"omp parallel for")
587 for (
int index = 0; index < last - first; index++)
589 spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
590 offload_scratch_ptr + first + index, spline_padded_size);
591 const int output_index = first + index;
600 const size_t first_cplx = first / 2;
601 const size_t last_cplx =
omptarget::min(last / 2, num_complex_splines);
602 PRAGMA_OFFLOAD(
"omp parallel for")
603 for (
int index = first_cplx; index < last_cplx; index++)
604 C2R::assign_vgl(x, y, z, results_scratch_ptr, sposet_padded_size, mKK_ptr, offload_scratch_ptr,
605 spline_padded_size, G, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local,
610 for (
size_t i = 0; i < requested_orb_size; i++)
612 psi[i] = results_scratch[i];
613 dpsi[i][0] = results_scratch[i + sposet_padded_size * 1];
614 dpsi[i][1] = results_scratch[i + sposet_padded_size * 2];
615 dpsi[i][2] = results_scratch[i + sposet_padded_size * 3];
616 d2psi[i] = results_scratch[i + sposet_padded_size * 4];
620 template<
typename ST>
628 const size_t num_pos = psi_v_list.size();
629 const size_t ChunkSizePerTeam = 512;
630 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
631 const auto spline_padded_size = myV.size();
632 const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
636 results_scratch.resize(sposet_padded_size * num_pos * 5);
639 const auto* spline_ptr = SplineInst->getSplinePtr();
640 auto* pos_copy_ptr = multi_pos.data();
641 auto* offload_scratch_ptr = offload_scratch.data();
642 auto* results_scratch_ptr = results_scratch.data();
643 const auto myKcart_padded_size = myKcart->capacity();
644 auto* mKK_ptr = mKK->data();
645 auto* GGt_ptr = GGt_offload->data();
646 auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
647 auto* myKcart_ptr = myKcart->data();
648 const size_t first_spo_local = first_spo;
649 const size_t nComplexBands_local = nComplexBands;
650 const auto requested_orb_size = psi_v_list[0].get().size();
651 const auto num_complex_splines = kPoints.size();
655 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \ 656 map(always, to: pos_copy_ptr[0:num_pos*6]) \ 657 map(always, from: results_scratch_ptr[0:sposet_padded_size*num_pos*5])")
658 for (
int iw = 0; iw < num_pos; iw++)
659 for (
int team_id = 0; team_id < NumTeams; team_id++)
661 const size_t first = ChunkSizePerTeam * team_id;
662 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
664 auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw *
SoAFields3D::NUM_FIELDS;
665 auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
668 ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
669 spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
670 pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
672 const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
673 PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
674 PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
675 const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
676 GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
678 PRAGMA_OFFLOAD(
"omp parallel for")
679 for (
int index = 0; index < last - first; index++)
681 spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
682 d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
683 const int output_index = first + index;
692 const size_t first_cplx = first / 2;
693 const size_t last_cplx =
omptarget::min(last / 2, num_complex_splines);
694 PRAGMA_OFFLOAD(
"omp parallel for")
695 for (
int index = first_cplx; index < last_cplx; index++)
696 C2R::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
697 sposet_padded_size, mKK_ptr, offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr,
698 myKcart_padded_size, first_spo_local, nComplexBands_local, index);
702 for (
int iw = 0; iw < num_pos; ++iw)
704 auto* restrict results_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
708 for (
size_t i = 0; i < requested_orb_size; i++)
710 psi_v[i] = results_iw_ptr[i];
711 dpsi_v[i][0] = results_iw_ptr[i + sposet_padded_size];
712 dpsi_v[i][1] = results_iw_ptr[i + sposet_padded_size * 2];
713 dpsi_v[i][2] = results_iw_ptr[i + sposet_padded_size * 3];
714 d2psi_v[i] = results_iw_ptr[i + sposet_padded_size * 4];
719 template<
typename ST>
730 auto& mw_pos_copy = mw_mem.mw_pos_copy;
731 auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
732 auto& mw_results_scratch = mw_mem.mw_results_scratch;
733 const int nwalkers = sa_list.size();
734 mw_pos_copy.resize(nwalkers * 6);
737 for (
int iw = 0; iw < nwalkers; ++iw)
739 const PointType& r = P_list[iw].activeR(iat);
740 PointType ru(PrimLattice.toUnit_floor(r));
741 mw_pos_copy[iw * 6] = r[0];
742 mw_pos_copy[iw * 6 + 1] = r[1];
743 mw_pos_copy[iw * 6 + 2] = r[2];
744 mw_pos_copy[iw * 6 + 3] = ru[0];
745 mw_pos_copy[iw * 6 + 4] = ru[1];
746 mw_pos_copy[iw * 6 + 5] = ru[2];
749 phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch, mw_results_scratch, psi_v_list, dpsi_v_list,
753 template<
typename ST>
757 const std::vector<const ValueType*>& invRow_ptr_list,
759 std::vector<ValueType>& ratios,
760 std::vector<GradType>& grads)
const 765 auto& buffer_H2D = mw_mem.buffer_H2D;
766 auto& rg_private = mw_mem.rg_private;
767 auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
768 auto& mw_results_scratch = mw_mem.mw_results_scratch;
769 const int nwalkers = spo_list.size();
770 buffer_H2D.resize(nwalkers,
sizeof(ST) * 6 +
sizeof(
ValueType*));
773 for (
int iw = 0; iw < nwalkers; ++iw)
775 const PointType& r = P_list[iw].activeR(iat);
776 PointType ru(PrimLattice.toUnit_floor(r));
777 Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
786 auto& invRow_ptr = *
reinterpret_cast<const ValueType**
>(buffer_H2D[iw] +
sizeof(ST) * 6);
787 invRow_ptr = invRow_ptr_list[iw];
790 const size_t num_pos = nwalkers;
791 const auto spline_padded_size = myV.
size();
792 const auto sposet_padded_size = getAlignedSize<TT>(OrbitalSetSize);
793 const size_t ChunkSizePerTeam = 512;
794 const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
798 mw_results_scratch.resize(sposet_padded_size * num_pos * 5);
800 rg_private.resize(num_pos, NumTeams * 4);
803 const auto* spline_ptr = SplineInst->getSplinePtr();
804 auto* buffer_H2D_ptr = buffer_H2D.data();
805 auto* offload_scratch_ptr = mw_offload_scratch.data();
806 auto* results_scratch_ptr = mw_results_scratch.data();
807 const auto myKcart_padded_size = myKcart->capacity();
808 auto* mKK_ptr = mKK->data();
809 auto* GGt_ptr = GGt_offload->data();
810 auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
811 auto* myKcart_ptr = myKcart->data();
812 auto* phi_vgl_ptr = phi_vgl_v.
data();
813 auto* rg_private_ptr = rg_private.data();
814 const size_t buffer_H2D_stride = buffer_H2D.cols();
815 const size_t first_spo_local = first_spo;
816 const auto requested_orb_size = phi_vgl_v.
size(2);
817 const size_t phi_vgl_stride = num_pos * requested_orb_size;
818 const size_t nComplexBands_local = nComplexBands;
819 const auto num_complex_splines = kPoints.size();
823 PRAGMA_OFFLOAD(
"omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \ 824 map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \ 825 map(always, from: rg_private_ptr[0:rg_private.size()])")
826 for (
int iw = 0; iw < num_pos; iw++)
827 for (
int team_id = 0; team_id < NumTeams; team_id++)
829 const size_t first = ChunkSizePerTeam * team_id;
830 const size_t last =
omptarget::min(first + ChunkSizePerTeam, spline_padded_size);
832 auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + spline_padded_size * iw *
SoAFields3D::NUM_FIELDS;
833 auto* restrict psi_iw_ptr = results_scratch_ptr + sposet_padded_size * iw * 5;
834 const auto* restrict pos_iw_ptr =
reinterpret_cast<ST*
>(buffer_H2D_ptr + buffer_H2D_stride * iw);
835 const auto* restrict invRow_iw_ptr =
836 *
reinterpret_cast<ValueType**
>(buffer_H2D_ptr + buffer_H2D_stride * iw +
sizeof(ST) * 6);
839 ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
840 spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
841 c, da, db, dc, d2a, d2b, d2c);
843 const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
844 PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
845 PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
846 const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
847 GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
849 PRAGMA_OFFLOAD(
"omp parallel for")
850 for (
int index = 0; index < last - first; index++)
852 spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
853 d2c, offload_scratch_iw_ptr + first + index, spline_padded_size);
854 const int output_index = first + index;
863 const size_t first_cplx = first / 2;
864 const size_t last_cplx =
omptarget::min(last / 2, num_complex_splines);
865 PRAGMA_OFFLOAD(
"omp parallel for")
866 for (
int index = first_cplx; index < last_cplx; index++)
867 C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, sposet_padded_size, mKK_ptr,
868 offload_scratch_iw_ptr, spline_padded_size, G, myKcart_ptr, myKcart_padded_size,
869 first_spo_local, nComplexBands_local, index);
872 ValueType* restrict dpsi_x = psi_iw_ptr + sposet_padded_size;
873 ValueType* restrict dpsi_y = psi_iw_ptr + sposet_padded_size * 2;
874 ValueType* restrict dpsi_z = psi_iw_ptr + sposet_padded_size * 3;
875 ValueType* restrict d2psi = psi_iw_ptr + sposet_padded_size * 4;
877 ValueType* restrict out_phi = phi_vgl_ptr + iw * requested_orb_size;
878 ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
879 ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
880 ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
881 ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride;
883 const size_t first_real = first_cplx +
omptarget::min(nComplexBands_local, first_cplx);
884 const size_t last_real =
886 ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
887 PRAGMA_OFFLOAD(
"omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
888 for (
int j = first_real; j < last_real; j++)
891 out_dphi_x[j] = dpsi_x[j];
892 out_dphi_y[j] = dpsi_y[j];
893 out_dphi_z[j] = dpsi_z[j];
894 out_d2phi[j] = d2psi[j];
896 ratio += psi[j] * invRow_iw_ptr[j];
897 grad_x += dpsi_x[j] * invRow_iw_ptr[j];
898 grad_y += dpsi_y[j] * invRow_iw_ptr[j];
899 grad_z += dpsi_z[j] * invRow_iw_ptr[j];
902 rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio;
903 rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
904 rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
905 rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
909 for (
int iw = 0; iw < num_pos; iw++)
912 for (
int team_id = 0; team_id < NumTeams; team_id++)
913 ratio += rg_private[iw][team_id * 4];
916 ValueType grad_x(0), grad_y(0), grad_z(0);
917 for (
int team_id = 0; team_id < NumTeams; team_id++)
919 grad_x += rg_private[iw][team_id * 4 + 1];
920 grad_y += rg_private[iw][team_id * 4 + 2];
921 grad_z += rg_private[iw][team_id * 4 + 3];
923 grads[iw] =
GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
927 template<
typename ST>
936 last = last > kPoints.size() ? kPoints.size() : last;
938 const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
939 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
940 g22 = PrimLattice.G(8);
941 const ST x = r[0], y = r[1], z = r[2];
943 const ST* restrict k0 = myKcart->
data(0);
944 const ST* restrict k1 = myKcart->data(1);
945 const ST* restrict k2 = myKcart->data(2);
947 const ST* restrict g0 = myG.data(0);
948 const ST* restrict g1 = myG.data(1);
949 const ST* restrict g2 = myG.data(2);
950 const ST* restrict h00 = myH.data(0);
951 const ST* restrict h01 = myH.data(1);
952 const ST* restrict h02 = myH.data(2);
953 const ST* restrict h11 = myH.data(3);
954 const ST* restrict h12 = myH.data(4);
955 const ST* restrict h22 = myH.data(5);
957 const size_t requested_orb_size = psi.size();
959 for (
size_t j = first; j <
std::min(nComplexBands, last); j++)
967 const ST val_r = myV[jr];
968 const ST val_i = myV[ji];
975 const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
976 const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
977 const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
979 const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
980 const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
981 const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
984 const ST gX_r = dX_r + val_i * kX;
985 const ST gY_r = dY_r + val_i * kY;
986 const ST gZ_r = dZ_r + val_i * kZ;
987 const ST gX_i = dX_i - val_r * kX;
988 const ST gY_i = dY_i - val_r * kY;
989 const ST gZ_i = dZ_i - val_r * kZ;
991 const size_t psiIndex = first_spo + jr;
993 if (psiIndex < requested_orb_size)
995 psi[psiIndex] = c * val_r -
s * val_i;
996 dpsi[psiIndex][0] = c * gX_r -
s * gX_i;
997 dpsi[psiIndex][1] = c * gY_r -
s * gY_i;
998 dpsi[psiIndex][2] = c * gZ_r -
s * gZ_i;
1001 if (psiIndex + 1 < requested_orb_size)
1003 psi[psiIndex + 1] = c * val_i +
s * val_r;
1004 dpsi[psiIndex + 1][0] = c * gX_i +
s * gX_r;
1005 dpsi[psiIndex + 1][1] = c * gY_i +
s * gY_r;
1006 dpsi[psiIndex + 1][2] = c * gZ_i +
s * gZ_r;
1010 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
1012 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
1014 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
1016 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
1018 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
1020 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
1022 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
1024 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
1026 v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
1029 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
1031 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
1033 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
1035 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
1037 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
1039 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
1041 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
1043 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
1045 v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
1047 if (psiIndex < requested_orb_size)
1049 grad_grad_psi[psiIndex][0] = c * h_xx_r -
s * h_xx_i;
1050 grad_grad_psi[psiIndex][1] = c * h_xy_r -
s * h_xy_i;
1051 grad_grad_psi[psiIndex][2] = c * h_xz_r -
s * h_xz_i;
1052 grad_grad_psi[psiIndex][3] = c * h_yx_r -
s * h_yx_i;
1053 grad_grad_psi[psiIndex][4] = c * h_yy_r -
s * h_yy_i;
1054 grad_grad_psi[psiIndex][5] = c * h_yz_r -
s * h_yz_i;
1055 grad_grad_psi[psiIndex][6] = c * h_zx_r -
s * h_zx_i;
1056 grad_grad_psi[psiIndex][7] = c * h_zy_r -
s * h_zy_i;
1057 grad_grad_psi[psiIndex][8] = c * h_zz_r -
s * h_zz_i;
1060 if (psiIndex + 1 < requested_orb_size)
1062 grad_grad_psi[psiIndex + 1][0] = c * h_xx_i +
s * h_xx_r;
1063 grad_grad_psi[psiIndex + 1][1] = c * h_xy_i +
s * h_xy_r;
1064 grad_grad_psi[psiIndex + 1][2] = c * h_xz_i +
s * h_xz_r;
1065 grad_grad_psi[psiIndex + 1][3] = c * h_yx_i +
s * h_yx_r;
1066 grad_grad_psi[psiIndex + 1][4] = c * h_yy_i +
s * h_yy_r;
1067 grad_grad_psi[psiIndex + 1][5] = c * h_yz_i +
s * h_yz_r;
1068 grad_grad_psi[psiIndex + 1][6] = c * h_zx_i +
s * h_zx_r;
1069 grad_grad_psi[psiIndex + 1][7] = c * h_zy_i +
s * h_zy_r;
1070 grad_grad_psi[psiIndex + 1][8] = c * h_zz_i +
s * h_zz_r;
1075 for (
size_t j = std::max(nComplexBands, first); j < last; j++)
1080 const ST kX = k0[j];
1081 const ST kY = k1[j];
1082 const ST kZ = k2[j];
1083 const ST val_r = myV[jr];
1084 const ST val_i = myV[ji];
1091 const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1092 const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1093 const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1095 const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1096 const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1097 const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1100 const ST gX_r = dX_r + val_i * kX;
1101 const ST gY_r = dY_r + val_i * kY;
1102 const ST gZ_r = dZ_r + val_i * kZ;
1103 const ST gX_i = dX_i - val_r * kX;
1104 const ST gY_i = dY_i - val_r * kY;
1105 const ST gZ_i = dZ_i - val_r * kZ;
1107 if (
const size_t psiIndex = first_spo + nComplexBands + j; psiIndex < requested_orb_size)
1109 psi[psiIndex] = c * val_r -
s * val_i;
1110 dpsi[psiIndex][0] = c * gX_r -
s * gX_i;
1111 dpsi[psiIndex][1] = c * gY_r -
s * gY_i;
1112 dpsi[psiIndex][2] = c * gZ_r -
s * gZ_i;
1114 const ST h_xx_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) +
1116 const ST h_xy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) +
1118 const ST h_xz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) +
1120 const ST h_yx_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) +
1122 const ST h_yy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) +
1124 const ST h_yz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) +
1126 const ST h_zx_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) +
1128 const ST h_zy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) +
1130 const ST h_zz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) +
1133 const ST h_xx_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) -
1135 const ST h_xy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) -
1137 const ST h_xz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) -
1139 const ST h_yx_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) -
1141 const ST h_yy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) -
1143 const ST h_yz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) -
1145 const ST h_zx_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) -
1147 const ST h_zy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) -
1149 const ST h_zz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) -
1152 grad_grad_psi[psiIndex][0] = c * h_xx_r -
s * h_xx_i;
1153 grad_grad_psi[psiIndex][1] = c * h_xy_r -
s * h_xy_i;
1154 grad_grad_psi[psiIndex][2] = c * h_xz_r -
s * h_xz_i;
1155 grad_grad_psi[psiIndex][3] = c * h_yx_r -
s * h_yx_i;
1156 grad_grad_psi[psiIndex][4] = c * h_yy_r -
s * h_yy_i;
1157 grad_grad_psi[psiIndex][5] = c * h_yz_r -
s * h_yz_i;
1158 grad_grad_psi[psiIndex][6] = c * h_zx_r -
s * h_zx_i;
1159 grad_grad_psi[psiIndex][7] = c * h_zy_r -
s * h_zy_i;
1160 grad_grad_psi[psiIndex][8] = c * h_zz_r -
s * h_zz_i;
1165 template<
typename ST>
1173 PointType ru(PrimLattice.toUnit_floor(r));
1174 #pragma omp parallel 1179 spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
1180 assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
1184 template<
typename ST>
1194 last = last < 0 ? kPoints.size() : (last > kPoints.size() ? kPoints.size() : last);
1196 const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
1197 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
1198 g22 = PrimLattice.G(8);
1199 const ST x = r[0], y = r[1], z = r[2];
1201 const ST* restrict k0 = myKcart->
data(0);
1202 const ST* restrict k1 = myKcart->data(1);
1203 const ST* restrict k2 = myKcart->data(2);
1205 const ST* restrict g0 = myG.data(0);
1206 const ST* restrict g1 = myG.data(1);
1207 const ST* restrict g2 = myG.data(2);
1208 const ST* restrict h00 = myH.data(0);
1209 const ST* restrict h01 = myH.data(1);
1210 const ST* restrict h02 = myH.data(2);
1211 const ST* restrict h11 = myH.data(3);
1212 const ST* restrict h12 = myH.data(4);
1213 const ST* restrict h22 = myH.data(5);
1215 const ST* restrict gh000 = mygH.data(0);
1216 const ST* restrict gh001 = mygH.data(1);
1217 const ST* restrict gh002 = mygH.data(2);
1218 const ST* restrict gh011 = mygH.data(3);
1219 const ST* restrict gh012 = mygH.data(4);
1220 const ST* restrict gh022 = mygH.data(5);
1221 const ST* restrict gh111 = mygH.data(6);
1222 const ST* restrict gh112 = mygH.data(7);
1223 const ST* restrict gh122 = mygH.data(8);
1224 const ST* restrict gh222 = mygH.data(9);
1226 const size_t requested_orb_size = psi.size();
1229 for (
size_t j = first; j <
std::min(nComplexBands, last); j++)
1234 const ST kX = k0[j];
1235 const ST kY = k1[j];
1236 const ST kZ = k2[j];
1237 const ST val_r = myV[jr];
1238 const ST val_i = myV[ji];
1245 const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1246 const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1247 const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1249 const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1250 const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1251 const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1254 const ST gX_r = dX_r + val_i * kX;
1255 const ST gY_r = dY_r + val_i * kY;
1256 const ST gZ_r = dZ_r + val_i * kZ;
1257 const ST gX_i = dX_i - val_r * kX;
1258 const ST gY_i = dY_i - val_r * kY;
1259 const ST gZ_i = dZ_i - val_r * kZ;
1261 const size_t psiIndex = first_spo + jr;
1263 if (psiIndex < requested_orb_size)
1265 psi[psiIndex] = c * val_r -
s * val_i;
1266 dpsi[psiIndex][0] = c * gX_r -
s * gX_i;
1267 dpsi[psiIndex][1] = c * gY_r -
s * gY_i;
1268 dpsi[psiIndex][2] = c * gZ_r -
s * gZ_i;
1271 if (psiIndex + 1 < requested_orb_size)
1273 psi[psiIndex + 1] = c * val_i +
s * val_r;
1274 dpsi[psiIndex + 1][0] = c * gX_i +
s * gX_r;
1275 dpsi[psiIndex + 1][1] = c * gY_i +
s * gY_r;
1276 dpsi[psiIndex + 1][2] = c * gZ_i +
s * gZ_r;
1280 const ST f_xx_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
1281 const ST f_xy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
1282 const ST f_xz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
1283 const ST f_yy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
1284 const ST f_yz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1285 const ST f_zz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1287 const ST f_xx_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1288 const ST f_xy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1289 const ST f_xz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1290 const ST f_yy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1291 const ST f_yz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1292 const ST f_zz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1294 const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1295 const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1296 const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1297 const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1298 const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1299 const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1301 const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1302 const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1303 const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1304 const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1305 const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1306 const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1308 if (psiIndex < requested_orb_size)
1310 grad_grad_psi[psiIndex][0] = c * h_xx_r -
s * h_xx_i;
1311 grad_grad_psi[psiIndex][1] = c * h_xy_r -
s * h_xy_i;
1312 grad_grad_psi[psiIndex][2] = c * h_xz_r -
s * h_xz_i;
1313 grad_grad_psi[psiIndex][3] = c * h_xy_r -
s * h_xy_i;
1314 grad_grad_psi[psiIndex][4] = c * h_yy_r -
s * h_yy_i;
1315 grad_grad_psi[psiIndex][5] = c * h_yz_r -
s * h_yz_i;
1316 grad_grad_psi[psiIndex][6] = c * h_xz_r -
s * h_xz_i;
1317 grad_grad_psi[psiIndex][7] = c * h_yz_r -
s * h_yz_i;
1318 grad_grad_psi[psiIndex][8] = c * h_zz_r -
s * h_zz_i;
1321 if (psiIndex + 1 < requested_orb_size)
1323 grad_grad_psi[psiIndex + 1][0] = c * h_xx_i +
s * h_xx_r;
1324 grad_grad_psi[psiIndex + 1][1] = c * h_xy_i +
s * h_xy_r;
1325 grad_grad_psi[psiIndex + 1][2] = c * h_xz_i +
s * h_xz_r;
1326 grad_grad_psi[psiIndex + 1][3] = c * h_xy_i +
s * h_xy_r;
1327 grad_grad_psi[psiIndex + 1][4] = c * h_yy_i +
s * h_yy_r;
1328 grad_grad_psi[psiIndex + 1][5] = c * h_yz_i +
s * h_yz_r;
1329 grad_grad_psi[psiIndex + 1][6] = c * h_xz_i +
s * h_xz_r;
1330 grad_grad_psi[psiIndex + 1][7] = c * h_yz_i +
s * h_yz_r;
1331 grad_grad_psi[psiIndex + 1][8] = c * h_zz_i +
s * h_zz_r;
1337 const ST f3_xxx_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1338 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1339 const ST f3_xxy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1340 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1341 const ST f3_xxz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1342 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1343 const ST f3_xyy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1344 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1345 const ST f3_xyz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1346 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1347 const ST f3_xzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1348 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1349 const ST f3_yyy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1350 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1351 const ST f3_yyz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1352 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1353 const ST f3_yzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1354 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1355 const ST f3_zzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1356 gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1358 const ST f3_xxx_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1359 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1360 const ST f3_xxy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1361 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1362 const ST f3_xxz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1363 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1364 const ST f3_xyy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1365 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1366 const ST f3_xyz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1367 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1368 const ST f3_xzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1369 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1370 const ST f3_yyy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1371 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1372 const ST f3_yyz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1373 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1374 const ST f3_yzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1375 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1376 const ST f3_zzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1377 gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1380 const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1381 const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1383 f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1385 f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1387 f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1389 f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1391 f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1393 f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1394 const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1395 (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1396 const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1397 (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1399 f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1401 f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1402 const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1403 const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1405 f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1407 f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1409 f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1411 f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1412 const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1413 const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1415 if (psiIndex < requested_orb_size)
1417 grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r -
s * gh_xxx_i;
1418 grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r -
s * gh_xxy_i;
1419 grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r -
s * gh_xxz_i;
1420 grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r -
s * gh_xxy_i;
1421 grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r -
s * gh_xyy_i;
1422 grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r -
s * gh_xyz_i;
1423 grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r -
s * gh_xxz_i;
1424 grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r -
s * gh_xyz_i;
1425 grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r -
s * gh_xzz_i;
1427 grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r -
s * gh_xxy_i;
1428 grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r -
s * gh_xyy_i;
1429 grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r -
s * gh_xyz_i;
1430 grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r -
s * gh_xyy_i;
1431 grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r -
s * gh_yyy_i;
1432 grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r -
s * gh_yyz_i;
1433 grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r -
s * gh_xyz_i;
1434 grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r -
s * gh_yyz_i;
1435 grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r -
s * gh_yzz_i;
1437 grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r -
s * gh_xxz_i;
1438 grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r -
s * gh_xyz_i;
1439 grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r -
s * gh_xzz_i;
1440 grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r -
s * gh_xyz_i;
1441 grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r -
s * gh_yyz_i;
1442 grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r -
s * gh_yzz_i;
1443 grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r -
s * gh_xzz_i;
1444 grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r -
s * gh_yzz_i;
1445 grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r -
s * gh_zzz_i;
1448 if (psiIndex + 1 < requested_orb_size)
1450 grad_grad_grad_psi[psiIndex + 1][0][0] = c * gh_xxx_i +
s * gh_xxx_r;
1451 grad_grad_grad_psi[psiIndex + 1][0][1] = c * gh_xxy_i +
s * gh_xxy_r;
1452 grad_grad_grad_psi[psiIndex + 1][0][2] = c * gh_xxz_i +
s * gh_xxz_r;
1453 grad_grad_grad_psi[psiIndex + 1][0][3] = c * gh_xxy_i +
s * gh_xxy_r;
1454 grad_grad_grad_psi[psiIndex + 1][0][4] = c * gh_xyy_i +
s * gh_xyy_r;
1455 grad_grad_grad_psi[psiIndex + 1][0][5] = c * gh_xyz_i +
s * gh_xyz_r;
1456 grad_grad_grad_psi[psiIndex + 1][0][6] = c * gh_xxz_i +
s * gh_xxz_r;
1457 grad_grad_grad_psi[psiIndex + 1][0][7] = c * gh_xyz_i +
s * gh_xyz_r;
1458 grad_grad_grad_psi[psiIndex + 1][0][8] = c * gh_xzz_i +
s * gh_xzz_r;
1460 grad_grad_grad_psi[psiIndex + 1][1][0] = c * gh_xxy_i +
s * gh_xxy_r;
1461 grad_grad_grad_psi[psiIndex + 1][1][1] = c * gh_xyy_i +
s * gh_xyy_r;
1462 grad_grad_grad_psi[psiIndex + 1][1][2] = c * gh_xyz_i +
s * gh_xyz_r;
1463 grad_grad_grad_psi[psiIndex + 1][1][3] = c * gh_xyy_i +
s * gh_xyy_r;
1464 grad_grad_grad_psi[psiIndex + 1][1][4] = c * gh_yyy_i +
s * gh_yyy_r;
1465 grad_grad_grad_psi[psiIndex + 1][1][5] = c * gh_yyz_i +
s * gh_yyz_r;
1466 grad_grad_grad_psi[psiIndex + 1][1][6] = c * gh_xyz_i +
s * gh_xyz_r;
1467 grad_grad_grad_psi[psiIndex + 1][1][7] = c * gh_yyz_i +
s * gh_yyz_r;
1468 grad_grad_grad_psi[psiIndex + 1][1][8] = c * gh_yzz_i +
s * gh_yzz_r;
1470 grad_grad_grad_psi[psiIndex + 1][2][0] = c * gh_xxz_i +
s * gh_xxz_r;
1471 grad_grad_grad_psi[psiIndex + 1][2][1] = c * gh_xyz_i +
s * gh_xyz_r;
1472 grad_grad_grad_psi[psiIndex + 1][2][2] = c * gh_xzz_i +
s * gh_xzz_r;
1473 grad_grad_grad_psi[psiIndex + 1][2][3] = c * gh_xyz_i +
s * gh_xyz_r;
1474 grad_grad_grad_psi[psiIndex + 1][2][4] = c * gh_yyz_i +
s * gh_yyz_r;
1475 grad_grad_grad_psi[psiIndex + 1][2][5] = c * gh_yzz_i +
s * gh_yzz_r;
1476 grad_grad_grad_psi[psiIndex + 1][2][6] = c * gh_xzz_i +
s * gh_xzz_r;
1477 grad_grad_grad_psi[psiIndex + 1][2][7] = c * gh_yzz_i +
s * gh_yzz_r;
1478 grad_grad_grad_psi[psiIndex + 1][2][8] = c * gh_zzz_i +
s * gh_zzz_r;
1482 for (
size_t j = std::max(nComplexBands, first); j < last; j++)
1487 const ST kX = k0[j];
1488 const ST kY = k1[j];
1489 const ST kZ = k2[j];
1490 const ST val_r = myV[jr];
1491 const ST val_i = myV[ji];
1498 const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1499 const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1500 const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1502 const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1503 const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1504 const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1507 const ST gX_r = dX_r + val_i * kX;
1508 const ST gY_r = dY_r + val_i * kY;
1509 const ST gZ_r = dZ_r + val_i * kZ;
1510 const ST gX_i = dX_i - val_r * kX;
1511 const ST gY_i = dY_i - val_r * kY;
1512 const ST gZ_i = dZ_i - val_r * kZ;
1514 if (
const size_t psiIndex = first_spo + nComplexBands + j; psiIndex < requested_orb_size)
1516 psi[psiIndex] = c * val_r -
s * val_i;
1517 dpsi[psiIndex][0] = c * gX_r -
s * gX_i;
1518 dpsi[psiIndex][1] = c * gY_r -
s * gY_i;
1519 dpsi[psiIndex][2] = c * gZ_r -
s * gZ_i;
1522 const ST f_xx_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
1523 const ST f_xy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
1524 const ST f_xz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
1525 const ST f_yy_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
1526 const ST f_yz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1527 const ST f_zz_r =
v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1529 const ST f_xx_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1530 const ST f_xy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1531 const ST f_xz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1532 const ST f_yy_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1533 const ST f_yz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1534 const ST f_zz_i =
v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1536 const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1537 const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1538 const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1539 const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1540 const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1541 const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1543 const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1544 const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1545 const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1546 const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1547 const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1548 const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1550 grad_grad_psi[psiIndex][0] = c * h_xx_r -
s * h_xx_i;
1551 grad_grad_psi[psiIndex][1] = c * h_xy_r -
s * h_xy_i;
1552 grad_grad_psi[psiIndex][2] = c * h_xz_r -
s * h_xz_i;
1553 grad_grad_psi[psiIndex][3] = c * h_xy_r -
s * h_xy_i;
1554 grad_grad_psi[psiIndex][4] = c * h_yy_r -
s * h_yy_i;
1555 grad_grad_psi[psiIndex][5] = c * h_yz_r -
s * h_yz_i;
1556 grad_grad_psi[psiIndex][6] = c * h_xz_r -
s * h_xz_i;
1557 grad_grad_psi[psiIndex][7] = c * h_yz_r -
s * h_yz_i;
1558 grad_grad_psi[psiIndex][8] = c * h_zz_r -
s * h_zz_i;
1563 const ST f3_xxx_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1564 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1565 const ST f3_xxy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1566 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1567 const ST f3_xxz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1568 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1569 const ST f3_xyy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1570 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1571 const ST f3_xyz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1572 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1573 const ST f3_xzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1574 gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1575 const ST f3_yyy_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1576 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1577 const ST f3_yyz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1578 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1579 const ST f3_yzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1580 gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1581 const ST f3_zzz_r =
t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1582 gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1584 const ST f3_xxx_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1585 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1586 const ST f3_xxy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1587 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1588 const ST f3_xxz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1589 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1590 const ST f3_xyy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1591 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1592 const ST f3_xyz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1593 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1594 const ST f3_xzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1595 gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1596 const ST f3_yyy_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1597 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1598 const ST f3_yyz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1599 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1600 const ST f3_yzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1601 gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1602 const ST f3_zzz_i =
t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1603 gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1606 const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1607 const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1609 f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1611 f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1613 f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1615 f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1617 f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1619 f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1620 const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1621 (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1622 const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1623 (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1625 f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1627 f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1628 const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1629 const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1631 f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1633 f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1635 f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1637 f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1638 const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1639 const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1641 grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r -
s * gh_xxx_i;
1642 grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r -
s * gh_xxy_i;
1643 grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r -
s * gh_xxz_i;
1644 grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r -
s * gh_xxy_i;
1645 grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r -
s * gh_xyy_i;
1646 grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r -
s * gh_xyz_i;
1647 grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r -
s * gh_xxz_i;
1648 grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r -
s * gh_xyz_i;
1649 grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r -
s * gh_xzz_i;
1651 grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r -
s * gh_xxy_i;
1652 grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r -
s * gh_xyy_i;
1653 grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r -
s * gh_xyz_i;
1654 grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r -
s * gh_xyy_i;
1655 grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r -
s * gh_yyy_i;
1656 grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r -
s * gh_yyz_i;
1657 grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r -
s * gh_xyz_i;
1658 grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r -
s * gh_yyz_i;
1659 grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r -
s * gh_yzz_i;
1661 grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r -
s * gh_xxz_i;
1662 grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r -
s * gh_xyz_i;
1663 grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r -
s * gh_xzz_i;
1664 grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r -
s * gh_xyz_i;
1665 grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r -
s * gh_yyz_i;
1666 grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r -
s * gh_yzz_i;
1667 grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r -
s * gh_xzz_i;
1668 grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r -
s * gh_yzz_i;
1669 grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r -
s * gh_zzz_i;
1674 template<
typename ST>
1683 PointType ru(PrimLattice.toUnit_floor(r));
1684 #pragma omp parallel 1689 spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
1690 assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
1694 template<
typename ST>
1703 const int block_size = 16;
1706 std::vector<ValueVector> multi_psi_v;
1707 std::vector<GradVector> multi_dpsi_v;
1708 std::vector<ValueVector> multi_d2psi_v;
1713 multi_psi_v.reserve(block_size);
1714 multi_dpsi_v.reserve(block_size);
1715 multi_d2psi_v.reserve(block_size);
1716 psi_v_list.reserve(block_size);
1717 dpsi_v_list.reserve(block_size);
1718 d2psi_v_list.reserve(block_size);
1720 for (
int iat = first, i = 0; iat < last; iat += block_size, i += block_size)
1722 const int actual_block_size =
std::min(last - iat, block_size);
1723 multi_pos_copy.resize(actual_block_size * 6);
1724 multi_psi_v.clear();
1725 multi_dpsi_v.clear();
1726 multi_d2psi_v.clear();
1728 dpsi_v_list.clear();
1729 d2psi_v_list.clear();
1731 for (
int ipos = 0; ipos < actual_block_size; ++ipos)
1735 PointType ru(PrimLattice.toUnit_floor(r));
1736 multi_pos_copy[ipos * 6] = r[0];
1737 multi_pos_copy[ipos * 6 + 1] = r[1];
1738 multi_pos_copy[ipos * 6 + 2] = r[2];
1739 multi_pos_copy[ipos * 6 + 3] = ru[0];
1740 multi_pos_copy[ipos * 6 + 4] = ru[1];
1741 multi_pos_copy[ipos * 6 + 5] = ru[2];
1743 multi_psi_v.emplace_back(logdet[i + ipos], OrbitalSetSize);
1744 multi_dpsi_v.emplace_back(dlogdet[i + ipos], OrbitalSetSize);
1745 multi_d2psi_v.emplace_back(d2logdet[i + ipos], OrbitalSetSize);
1747 psi_v_list.push_back(multi_psi_v[ipos]);
1748 dpsi_v_list.push_back(multi_dpsi_v[ipos]);
1749 d2psi_v_list.push_back(multi_d2psi_v[ipos]);
1752 evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
OrbitalSetTraits< ValueType >::HessVector HessVector
T SymTrace(T h00, T h01, T h02, T h11, T h12, T h22, const T gg[6])
compute Trace(H*G)
helper functions for EinsplineSetBuilder
virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const std::vector< const ValueType *> &invRow_ptr_list, OffloadMWVGLArray &phi_vgl_v, std::vector< ValueType > &ratios, std::vector< GradType > &grads) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ra...
virtual void evaluateVGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi) override
evaluate the values, gradients and hessians of this single-particle orbital set
virtual void mw_evaluateDetRatios(const RefVectorWithLeader< SPOSet > &spo_list, const RefVectorWithLeader< const VirtualParticleSet > &vp_list, const RefVector< ValueVector > &psi_list, const std::vector< const ValueType *> &invRow_ptr_list, std::vector< std::vector< ValueType >> &ratios_list) const override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP, of multiple walkers ...
A ParticleSet that handles virtual moves of a selected particle of a given physical ParticleSet Virtu...
OrbitalSetTraits< ValueType >::ValueMatrix ValueMatrix
size_t getTotalNum() const
void assign_vgl_from_l(const PointType &r, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi)
assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian ...
T v_m_v(T h00, T h01, T h02, T h11, T h12, T h22, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z)
compute vector[3]^T x matrix[3][3] x vector[3]
T t3_contract(T h000, T h001, T h002, T h011, T h012, T h022, T h111, T h112, T h122, T h222, T g1x, T g1y, T g1z, T g2x, T g2y, T g2z, T g3x, T g3y, T g3z)
Coordinate transform for a 3rd rank symmetric tensor representing coordinate derivatives (hence t3_co...
void assign_vgl(ST x, ST y, ST z, TT *restrict results_scratch_ptr, size_t orb_padded_size, const ST *mKK_ptr, const ST *restrict offload_scratch_ptr, size_t spline_padded_size, const ST G[9], const ST *myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int nComplexBands, int index)
assign_vgl
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int index)
bool read_splines(hdf_archive &h5f)
void evaluateVGLMultiPos(const Vector< ST, OffloadPinnedAllocator< ST >> &multi_pos_copy, Vector< ST, OffloadPinnedAllocator< ST >> &offload_scratch, Vector< TT, OffloadPinnedAllocator< TT >> &results_scratch, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const
OrbitalSetTraits< ValueType >::GradMatrix GradMatrix
void assign_vgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, int first, int last) const
omp_int_t omp_get_thread_num()
Specialized paritlce class for atomistic simulations.
void assign_vghgh(const PointType &r, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi, int first=0, int last=-1) const
void assign_v(const PointType &r, const vContainer_type &myV, ValueVector &psi, int first, int last) const
class to match std::complex<ST> spline with BsplineSet::ValueType (real) SPOs with OpenMP offload ...
virtual void mw_evaluateVGL(const RefVectorWithLeader< SPOSet > &sa_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, const RefVector< ValueVector > &psi_v_list, const RefVector< GradVector > &dpsi_v_list, const RefVector< ValueVector > &d2psi_v_list) const override
evaluate the values, gradients and laplacians of this single-particle orbital sets of multiple walker...
bool write_splines(hdf_archive &h5f)
QTBase::ValueType ValueType
ResourceHandle< SplineOMPTargetMultiWalkerMem< ST, TT > > mw_mem_handle_
void FairDivideAligned(const int ntot, const int base, const int npart, const int me, int &first, int &last)
Partition ntot over npart and the size of each partition is a multiple of base size.
OrbitalSetTraits< ValueType >::ValueVector ValueVector
virtual void evaluateDetRatios(const VirtualParticleSet &VP, ValueVector &psi, const ValueVector &psiinv, std::vector< ValueType > &ratios) override
evaluate determinant ratios for virtual moves, e.g., sphere move for nonlocalPP
class to handle complex splines to real orbitals with splines of arbitrary precision splines storage ...
UBspline_3d_d SingleSplineType
virtual void evaluateVGHGH(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, HessVector &grad_grad_psi, GGGVector &grad_grad_grad_psi) override
evaluate the values, gradients, hessians, and grad hessians of this single-particle orbital set ...
const PosType & activeR(int iat) const
return the active position if the particle is active or the return current position if not ...
omp_int_t omp_get_num_threads()
OMPallocator is an allocator with fused device and dualspace allocator functionality.
void assign_v(ST x, ST y, ST z, TT *restrict results_scratch_ptr, const ST *restrict offload_scratch_ptr, const ST *restrict myKcart_ptr, size_t myKcart_padded_size, size_t first_spo, int nComplexBands, int index)
std::vector< std::reference_wrapper< T > > RefVector
CASTTYPE & getCastedLeader() const
void set_spline(SingleSplineType *spline_r, SingleSplineType *spline_i, int twist, int ispline, int level)
OrbitalSetTraits< ValueType >::GradHessVector GGGVector
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
OrbitalSetTraits< ValueType >::GradVector GradVector
typename BsplineSet::ValueType TT
void sincos(T a, T *restrict s, T *restrict c)
sincos function wrapper
#define ASSUME_ALIGNED(x)
virtual void evaluate_notranspose(const ParticleSet &P, int first, int last, ValueMatrix &logdet, GradMatrix &dlogdet, ValueMatrix &d2logdet) override
evaluate the values, gradients and laplacians of this single-particle orbital for [first...
bool readEntry(T &data, const std::string &aname)
read the data from the group aname and return status use read() for inbuilt error checking ...
virtual void evaluateVGL(const ParticleSet &P, const int iat, ValueVector &psi, GradVector &dpsi, ValueVector &d2psi) override
evaluate the values, gradients and laplacians of this single-particle orbital set ...
A D-dimensional Array class based on PETE.
SplineC2ROMPTarget(const std::string &my_name)
virtual void evaluateValue(const ParticleSet &P, const int iat, ValueVector &psi) override
evaluate the values of this single-particle orbital set
bool writeEntry(T &data, const std::string &aname)
write the data to the group aname and return status use write() for inbuilt error checking ...