// mw_evaluateVGL (fragmentary excerpt): batched evaluation of value, gradient
// and laplacian of the B-spline functor for one target particle across nw
// walkers, offloaded with OpenMP target.
// NOTE(review): the bare integers beginning many lines (25, 39, 40, ...) are
// original-file line numbers fused in by the extraction that produced this
// excerpt; interior lines of the function are missing wherever the numbering
// jumps, so this block is documented in place rather than rewritten.
25 template<
typename REAL>
// Only DIM == 3 is supported: the x/y/z displacement components are coded
// explicitly below (dipl_x/dipl_y/dipl_z, grad_x/grad_y/grad_z).
39 static_assert(
DIM == 3,
"only support 3D due to explicit x,y,z coded.");
// Per-walker stride in mw_dist: one distance row plus DIM displacement rows,
// each of length n_padded.
40 const size_t dist_stride = n_padded * (
DIM + 1);
// Pack all per-group constants into a single pinned transfer buffer so one
// map(to:) moves everything. Layout (all sub-arrays of length num_groups):
//   [REAL* coefs][REAL DeltaRInv][REAL cutoff_radius][int max_index]
46 transfer_buffer.resize((
sizeof(REAL*) +
sizeof(REAL) * 2 +
sizeof(
int)) * num_groups);
// Host-side views into the packed buffer, one per field.
// NOTE(review): these reinterpret_casts assume transfer_buffer.data() is
// suitably aligned for REAL* — presumably guaranteed by the pinned
// allocator; confirm against the allocator's alignment contract.
47 REAL** mw_coefs_ptr =
reinterpret_cast<REAL**
>(transfer_buffer.data());
48 REAL* mw_DeltaRInv_ptr =
reinterpret_cast<REAL*
>(transfer_buffer.data() +
sizeof(REAL*) * num_groups);
49 REAL* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups;
50 int* mw_max_index_ptr =
reinterpret_cast<int*
>(transfer_buffer.data() + (
sizeof(REAL*) +
sizeof(REAL) * 2) * num_groups);
// Fill per-group constants. The branch condition (orig. lines 52-53) is not
// visible here; presumably it tests whether functors[ig] is non-null.
51 for (
int ig = 0; ig < num_groups; ig++)
// Device-resident spline coefficients for this group.
54 mw_coefs_ptr[ig] = functors[ig]->
spline_coefs_->device_data();
55 mw_DeltaRInv_ptr[ig] = functors[ig]->
DeltaRInv;
// Fallback branch: zero out the slot so the device loop sees a null
// coefficient pointer and a zero cutoff (the r < cutoff test then fails).
61 mw_coefs_ptr[ig] =
nullptr;
62 mw_DeltaRInv_ptr[ig] = 0.0;
63 mw_cutoff_radius_ptr[ig] = 0.0;
64 mw_max_index_ptr[ig] = 0;
// Raw pointer for the map clause; the mapped strings below contain the same
// fused line-number artifacts ("\ 70", "\ 71", ...) as the code lines.
67 auto* transfer_buffer_ptr = transfer_buffer.data();
// One team per walker; per-group constants and distances go to the device,
// results (cur_allu derivatives and the per-walker vgl) come back.
69 PRAGMA_OFFLOAD(
"omp target teams distribute map(always, to: transfer_buffer_ptr[:transfer_buffer.size()]) \ 70 map(to: grp_ids[:n_src]) \ 71 map(to: mw_dist[:dist_stride*nw]) \ 72 map(from: mw_cur_allu[:n_padded*3*nw]) \ 73 map(always, from: mw_vgl[:(DIM+2)*nw])")
74 for (
int ip = 0; ip < nw; ip++)
// Distance table rows for walker ip: distances then x/y/z displacements.
82 const REAL* dist = mw_dist + ip * dist_stride;
83 const REAL* dipl_x = dist + n_padded;
84 const REAL* dipl_y = dist + n_padded * 2;
85 const REAL* dipl_z = dist + n_padded * 3;
// Re-derive the per-group views from the raw buffer on the device; the
// layout mirrors the host-side packing above exactly.
87 REAL** mw_coefs =
reinterpret_cast<REAL**
>(transfer_buffer_ptr);
88 REAL* mw_DeltaRInv =
reinterpret_cast<REAL*
>(transfer_buffer_ptr +
sizeof(REAL*) * num_groups);
89 REAL* mw_cutoff_radius = mw_DeltaRInv + num_groups;
90 int* mw_max_index =
reinterpret_cast<int*
>(transfer_buffer_ptr + (
sizeof(REAL*) +
sizeof(REAL) * 2) * num_groups);
// Per-walker scratch of u/dudr/d2udr2, three rows of n_padded each.
92 REAL* cur_allu = mw_cur_allu + ip * n_padded * 3;
// Parallel reduction over source particles; the accumulator declarations
// (val_sum, grad_*, lapl) are on lines not visible in this excerpt.
94 PRAGMA_OFFLOAD(
"omp parallel for reduction(+: val_sum, grad_x, grad_y, grad_z, lapl)")
95 for (
int j = 0; j < n_src; j++)
// Look up the spline parameters for the group source j belongs to.
99 const int ig = grp_ids[j];
100 const REAL* coefs = mw_coefs[ig];
101 REAL DeltaRInv = mw_DeltaRInv[ig];
102 REAL cutoff_radius = mw_cutoff_radius[ig];
103 const int max_index = mw_max_index[ig];
// Beyond the cutoff the functor is zero; only evaluate inside it.
109 if (r < cutoff_radius)
111 u = evaluate_impl(r, coefs, DeltaRInv, max_index, dudr, d2udr2);
// Cache the derivatives; mw_updateVGL reads them back as cur_dudr etc.
116 cur_allu[j + n_padded] = dudr;
117 cur_allu[j + n_padded * 2] = d2udr2;
// Radial laplacian: d2u/dr2 + (DIM-1)/r * du/dr; the 1/r factor is
// presumably folded into dudr on a line not visible here — confirm.
119 lapl += d2udr2 + (
DIM - 1) * dudr;
120 grad_x += dudr * dipl_x[j];
121 grad_y += dudr * dipl_y[j];
122 grad_z += dudr * dipl_z[j];
// Output slot for this walker: value, gradient (DIM) and laplacian.
125 REAL* vgl = mw_vgl + ip * (
DIM + 2);
// mw_evaluateV (fragmentary excerpt): batched evaluation of the functor value
// summed over source particles for num_pairs target/source distance rows,
// skipping the self pair ref_at[ip]. Offloaded with OpenMP target.
// NOTE(review): bare integers at line starts are fused original-file line
// numbers from extraction; gaps in that numbering mark missing code
// (including the function signature around orig. line 142).
134 template<
typename REAL>
142 const int dist_stride,
// Same packed transfer-buffer layout as mw_evaluateVGL:
//   [REAL* coefs][REAL DeltaRInv][REAL cutoff_radius][int max_index]
// with each sub-array of length num_groups.
150 transfer_buffer.resize((
sizeof(REAL*) +
sizeof(REAL) * 2 +
sizeof(
int)) * num_groups);
151 REAL** mw_coefs_ptr =
reinterpret_cast<REAL**
>(transfer_buffer.data());
152 REAL* mw_DeltaRInv_ptr =
reinterpret_cast<REAL*
>(transfer_buffer.data() +
sizeof(REAL*) * num_groups);
153 REAL* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups;
154 int* mw_max_index_ptr =
reinterpret_cast<int*
>(transfer_buffer.data() + (
sizeof(REAL*) +
sizeof(REAL) * 2) * num_groups);
// Fill per-group constants; the branch condition (orig. lines 156-157) is
// not visible — presumably it checks functors[ig] for null.
155 for (
int ig = 0; ig < num_groups; ig++)
158 mw_coefs_ptr[ig] = functors[ig]->
spline_coefs_->device_data();
159 mw_DeltaRInv_ptr[ig] = functors[ig]->
DeltaRInv;
// Maximum beginning index for coefficient access (see getMaxIndex()).
161 mw_max_index_ptr[ig] = functors[ig]->
getMaxIndex();
// Fallback branch: zeroed slot; zero cutoff disables evaluation below.
165 mw_coefs_ptr[ig] =
nullptr;
166 mw_DeltaRInv_ptr[ig] = 0.0;
167 mw_cutoff_radius_ptr[ig] = 0.0;
168 mw_max_index_ptr[ig] = 0;
171 auto* transfer_buffer_ptr = transfer_buffer.data();
// One team per pair row; only the scalar sums come back via mw_vals.
173 PRAGMA_OFFLOAD(
"omp target teams distribute map(always, to:transfer_buffer_ptr[:transfer_buffer.size()]) \ 174 map(to: grp_ids[:n_src]) \ 175 map(to:ref_at[:num_pairs], mw_dist[:dist_stride*num_pairs]) \ 176 map(always, from:mw_vals[:num_pairs])")
177 for (
int ip = 0; ip < num_pairs; ip++)
// Distance row for this pair.
180 const REAL* dist = mw_dist + ip * dist_stride;
// Re-derive the per-group views from the raw buffer on the device,
// mirroring the host-side packing above.
181 REAL** mw_coefs =
reinterpret_cast<REAL**
>(transfer_buffer_ptr);
182 REAL* mw_DeltaRInv =
reinterpret_cast<REAL*
>(transfer_buffer_ptr +
sizeof(REAL*) * num_groups);
183 REAL* mw_cutoff_radius = mw_DeltaRInv + num_groups;
184 int* mw_max_index =
reinterpret_cast<int*
>(transfer_buffer_ptr + (
sizeof(REAL*) +
sizeof(REAL) * 2) * num_groups);
// Parallel sum over sources; 'sum' is declared on a line not visible here.
185 PRAGMA_OFFLOAD(
"omp parallel for reduction(+: sum)")
186 for (
int j = 0; j < n_src; j++)
188 const int ig = grp_ids[j];
189 const REAL* coefs = mw_coefs[ig];
190 REAL DeltaRInv = mw_DeltaRInv[ig];
191 REAL cutoff_radius = mw_cutoff_radius[ig];
192 const int max_index = mw_max_index[ig];
// Skip the reference (self) particle and anything beyond the cutoff.
195 if (j != ref_at[ip] && r < cutoff_radius)
196 sum += evaluate_impl(r, coefs, DeltaRInv, max_index);
// mw_updateVGL (fragmentary excerpt): after a proposed move is accepted,
// update the stored per-particle value/gradient/laplacian tables (mw_allUat)
// for the accepted walkers only, using the derivatives cached by
// mw_evaluateVGL (mw_cur_allu) and freshly evaluated old-position terms.
// NOTE(review): bare integers at line starts are fused original-file line
// numbers from extraction; gaps mark missing code (signature, the
// isAccepted test around orig. line 251, and the closing statements).
202 template<
typename REAL>
204 const std::vector<bool>& isAccepted,
205 const int num_groups,
// Only DIM == 3 is supported: x/y/z displacement rows are coded explicitly.
218 static_assert(
DIM == 3,
"only support 3D due to explicit x,y,z coded.");
// Per-walker stride in mw_dist: one distance row plus DIM displacement rows.
219 const size_t dist_stride = n_padded * (
DIM + 1);
// Packed transfer buffer as in mw_evaluateVGL, with an extra int[nw] tail
// holding the indices of the accepted walkers:
//   [REAL* coefs][REAL DeltaRInv][REAL cutoff][int max_index] x num_groups
//   [int accepted_indices[nw]]
226 transfer_buffer.resize((
sizeof(REAL*) +
sizeof(REAL) * 2 +
sizeof(
int)) * num_groups + nw *
sizeof(
int));
227 REAL** mw_coefs_ptr =
reinterpret_cast<REAL**
>(transfer_buffer.data());
228 REAL* mw_DeltaRInv_ptr =
reinterpret_cast<REAL*
>(transfer_buffer.data() +
sizeof(REAL*) * num_groups);
229 REAL* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups;
230 int* mw_max_index_ptr =
reinterpret_cast<int*
>(transfer_buffer.data() + (
sizeof(REAL*) +
sizeof(REAL) * 2) * num_groups);
// Accepted-walker index list lives immediately after the max_index array.
231 int* accepted_indices = mw_max_index_ptr + num_groups;
// Fill per-group constants; the branch condition (orig. lines 234-235) is
// not visible — presumably it checks functors[ig] for null.
233 for (
int ig = 0; ig < num_groups; ig++)
236 mw_coefs_ptr[ig] = functors[ig]->
spline_coefs_->device_data();
237 mw_DeltaRInv_ptr[ig] = functors[ig]->
DeltaRInv;
239 mw_max_index_ptr[ig] = functors[ig]->
getMaxIndex();
// Fallback branch: zeroed slot; zero cutoff disables evaluation below.
243 mw_coefs_ptr[ig] =
nullptr;
244 mw_DeltaRInv_ptr[ig] = 0.0;
245 mw_cutoff_radius_ptr[ig] = 0.0;
246 mw_max_index_ptr[ig] = 0;
// Compact the accepted walker ids; the guard (orig. line 251, presumably
// "if (isAccepted[iw])") is on a line not visible in this excerpt.
250 for (
int iw = 0; iw < nw; iw++)
252 accepted_indices[nw_accepted++] = iw;
254 auto* transfer_buffer_ptr = transfer_buffer.data();
// One team per ACCEPTED walker; mw_vgl (computed by mw_evaluateVGL) goes to
// the device, the updated Uat/dUat/d2Uat tables come back.
256 PRAGMA_OFFLOAD(
"omp target teams distribute map(always, to: transfer_buffer_ptr[:transfer_buffer.size()]) \ 257 map(to: grp_ids[:n_src]) \ 258 map(to: mw_dist[:dist_stride*nw]) \ 259 map(to: mw_vgl[:(DIM+2)*nw]) \ 260 map(always, from: mw_allUat[:nw * n_padded * (DIM + 2)])")
261 for (
int iw = 0; iw < nw_accepted; iw++)
// Re-derive the per-group views from the raw buffer, mirroring the
// host-side packing above, then the accepted index tail.
263 REAL** mw_coefs =
reinterpret_cast<REAL**
>(transfer_buffer_ptr);
264 REAL* mw_DeltaRInv =
reinterpret_cast<REAL*
>(transfer_buffer_ptr +
sizeof(REAL*) * num_groups);
265 REAL* mw_cutoff_radius = mw_DeltaRInv + num_groups;
266 int* mw_max_index =
reinterpret_cast<int*
>(transfer_buffer_ptr + (
sizeof(REAL*) +
sizeof(REAL) * 2) * num_groups);
267 int* accepted_indices = mw_max_index + num_groups;
// ip is the real walker index behind accepted slot iw.
268 const int ip = accepted_indices[iw];
// New-position distance table rows for walker ip.
270 const REAL* dist_new = mw_dist + ip * dist_stride;
271 const REAL* dipl_x_new = dist_new + n_padded;
272 const REAL* dipl_y_new = dist_new + n_padded * 2;
273 const REAL* dipl_z_new = dist_new + n_padded * 3;
// Old-position rows: mw_dist holds all-new tables for nw walkers followed
// by all-old tables, hence the extra dist_stride * nw offset.
275 const REAL* dist_old = mw_dist + ip * dist_stride + dist_stride * nw;
276 const REAL* dipl_x_old = dist_old + n_padded;
277 const REAL* dipl_y_old = dist_old + n_padded * 2;
278 const REAL* dipl_z_old = dist_old + n_padded * 3;
// mw_allUat layout per these offsets: Uat[n_padded * nw], then
// dUat[(x|y|z) * n_padded per walker], then d2Uat[n_padded * nw].
280 REAL* Uat = mw_allUat + ip * n_padded;
281 REAL* dUat_x = mw_allUat + n_padded * nw + ip * n_padded *
DIM;
282 REAL* dUat_y = dUat_x + n_padded;
283 REAL* dUat_z = dUat_y + n_padded;
284 REAL* d2Uat = mw_allUat + n_padded * (
DIM + 1) * nw + ip * n_padded;
// Derivatives cached at the NEW position by mw_evaluateVGL.
286 REAL* cur_allu = mw_cur_allu + ip * n_padded * 3;
288 PRAGMA_OFFLOAD(
"omp parallel for")
289 for (
int j = 0; j < n_src; j++)
293 const int ig = grp_ids[j];
294 const REAL* coefs = mw_coefs[ig];
295 REAL DeltaRInv = mw_DeltaRInv[ig];
296 REAL cutoff_radius = mw_cutoff_radius[ig];
297 const int max_index = mw_max_index[ig];
// Evaluate the functor at the OLD distance so its contribution can be
// subtracted; outside the cutoff the contribution is zero.
299 REAL r = dist_old[j];
303 if (r < cutoff_radius)
305 u = evaluate_impl(dist_old[j], coefs, DeltaRInv, max_index, dudr, d2udr2);
// New-position value/derivatives cached by mw_evaluateVGL.
309 REAL cur_u = cur_allu[j];
310 REAL cur_dudr = cur_allu[j + n_padded];
311 REAL cur_d2udr2 = cur_allu[j + n_padded * 2];
// Replace the old contribution with the new one: subtract (new - old).
313 dUat_x[j] -= dipl_x_new[j] * cur_dudr - dipl_x_old[j] * dudr;
314 dUat_y[j] -= dipl_y_new[j] * cur_dudr - dipl_y_old[j] * dudr;
315 dUat_z[j] -= dipl_z_new[j] * cur_dudr - dipl_z_old[j] * dudr;
// Laplacian prefactor (DIM - 1) for the radial first-derivative term.
316 constexpr REAL lapfac(
DIM - 1);
317 d2Uat[j] -= cur_d2udr2 + lapfac * cur_dudr - (d2udr2 + lapfac * dudr);
// Finally store the freshly computed VGL of the moved particle iat
// (vgl[0] = value on a line not visible here; 1..3 = gradient).
319 REAL* vgl = mw_vgl + ip * (
DIM + 2);
321 dUat_x[iat] = vgl[1];
322 dUat_y[iat] = vgl[2];
323 dUat_z[iat] = vgl[3];
helper functions for EinsplineSetBuilder
BsplineFunctor class for the Jastrows. REAL is the real type used by the offload target; it is the correct type for the mw data pointers and is also used to coerce/implicitly convert the Real type inherited from OptimizableFunctorBase into that buffer. When offload is off this conversion still happens, but it is just an implementation quirk.
int getMaxIndex() const
Returns the maximum allowed beginning index for accessing the spline coefficients.
static void mw_evaluateVGL(const int iat, const int num_groups, const BsplineFunctor *const functors[], const int n_src, const int *grp_ids, const int nw, REAL *mw_vgl, const int n_padded, const REAL *mw_dist, REAL *mw_cur_allu, Vector< char, OffloadPinnedAllocator< char >> &transfer_buffer)
Compute value, gradient and laplacian for target particles. This is more than just a batched call of eval...
static void mw_updateVGL(const int iat, const std::vector< bool > &isAccepted, const int num_groups, const BsplineFunctor *const functors[], const int n_src, const int *grp_ids, const int nw, REAL *mw_vgl, const int n_padded, const REAL *mw_dist, REAL *mw_allUat, REAL *mw_cur_allu, Vector< char, OffloadPinnedAllocator< char >> &transfer_buffer)
Update value, gradient and laplacian for target particles. It serves multiple walkers and handles updat...
static void mw_evaluateV(const int num_groups, const BsplineFunctor *const functors[], const int n_src, const int *grp_ids, const int num_pairs, const int *ref_at, const REAL *mw_dist, const int dist_stride, REAL *mw_vals, Vector< char, OffloadPinnedAllocator< char >> &transfer_buffer)
Compute value for target-source particle pair potentials. This is more than just a batched call of evalua...
OMPallocator is an allocator with fused device and dualspace allocator functionality.
real_type cutoff_radius
maximum cutoff
std::shared_ptr< Vector< Real, OffloadAllocator< Real > > > spline_coefs_