QMCPACK
BsplineFunctor.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2022 QMCPACK developers.
6 //
7 // File developed by: John R. Gergely, University of Illinois at Urbana-Champaign
8 // Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
9 // Miguel Morales, moralessilva2@llnl.gov, Lawrence Livermore National Laboratory
10 // Raymond Clay III, j.k.rofling@gmail.com, Lawrence Livermore National Laboratory
11 // Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
12 // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
13 // Jaron T. Krogel, krogeljt@ornl.gov, Oak Ridge National Laboratory
14 // Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
15 // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp.
16 // Peter W. Doak, doakpw@ornl.gov, Oak Ridge National Lab
17 //
18 // File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
19 //////////////////////////////////////////////////////////////////////////////////////
20 
21 #include "BsplineFunctor.h"
22 
23 namespace qmcplusplus
24 {
25 template<typename REAL>
27  const int num_groups,
28  const BsplineFunctor* const functors[],
29  const int n_src,
30  const int* grp_ids,
31  const int nw,
32  REAL* mw_vgl, // [nw][DIM+2]
33  const int n_padded,
34  const REAL* mw_dist, // [nw][DIM+1][n_padded]
35  REAL* mw_cur_allu, // [nw][3][n_padded]
36  Vector<char, OffloadPinnedAllocator<char>>& transfer_buffer)
37 {
38  constexpr unsigned DIM = OHMMS_DIM;
39  static_assert(DIM == 3, "only support 3D due to explicit x,y,z coded.");
40  const size_t dist_stride = n_padded * (DIM + 1);
41 
42  /* transfer buffer used for
43  * Bspline coefs device pointer sizeof(T*), DeltaRInv sizeof(T) and cutoff_radius sizeof(T)
44  * these contents change based on the group of the target particle, so it is prepared per call.
45  */
46  transfer_buffer.resize((sizeof(REAL*) + sizeof(REAL) * 2 + sizeof(int)) * num_groups);
47  REAL** mw_coefs_ptr = reinterpret_cast<REAL**>(transfer_buffer.data());
48  REAL* mw_DeltaRInv_ptr = reinterpret_cast<REAL*>(transfer_buffer.data() + sizeof(REAL*) * num_groups);
49  REAL* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups;
50  int* mw_max_index_ptr = reinterpret_cast<int*>(transfer_buffer.data() + (sizeof(REAL*) + sizeof(REAL) * 2) * num_groups);
51  for (int ig = 0; ig < num_groups; ig++)
52  if (functors[ig])
53  {
54  mw_coefs_ptr[ig] = functors[ig]->spline_coefs_->device_data();
55  mw_DeltaRInv_ptr[ig] = functors[ig]->DeltaRInv;
56  mw_cutoff_radius_ptr[ig] = functors[ig]->cutoff_radius;
57  mw_max_index_ptr[ig] = functors[ig]->getMaxIndex();
58  }
59  else
60  {
61  mw_coefs_ptr[ig] = nullptr;
62  mw_DeltaRInv_ptr[ig] = 0.0;
63  mw_cutoff_radius_ptr[ig] = 0.0; // important! Prevent spline evaluation to access nullptr.
64  mw_max_index_ptr[ig] = 0;
65  }
66 
67  auto* transfer_buffer_ptr = transfer_buffer.data();
68 
69  PRAGMA_OFFLOAD("omp target teams distribute map(always, to: transfer_buffer_ptr[:transfer_buffer.size()]) \
70  map(to: grp_ids[:n_src]) \
71  map(to: mw_dist[:dist_stride*nw]) \
72  map(from: mw_cur_allu[:n_padded*3*nw]) \
73  map(always, from: mw_vgl[:(DIM+2)*nw])")
74  for (int ip = 0; ip < nw; ip++)
75  {
76  REAL val_sum(0);
77  REAL grad_x(0);
78  REAL grad_y(0);
79  REAL grad_z(0);
80  REAL lapl(0);
81 
82  const REAL* dist = mw_dist + ip * dist_stride;
83  const REAL* dipl_x = dist + n_padded;
84  const REAL* dipl_y = dist + n_padded * 2;
85  const REAL* dipl_z = dist + n_padded * 3;
86 
87  REAL** mw_coefs = reinterpret_cast<REAL**>(transfer_buffer_ptr);
88  REAL* mw_DeltaRInv = reinterpret_cast<REAL*>(transfer_buffer_ptr + sizeof(REAL*) * num_groups);
89  REAL* mw_cutoff_radius = mw_DeltaRInv + num_groups;
90  int* mw_max_index = reinterpret_cast<int*>(transfer_buffer_ptr + (sizeof(REAL*) + sizeof(REAL) * 2) * num_groups);
91 
92  REAL* cur_allu = mw_cur_allu + ip * n_padded * 3;
93 
94  PRAGMA_OFFLOAD("omp parallel for reduction(+: val_sum, grad_x, grad_y, grad_z, lapl)")
95  for (int j = 0; j < n_src; j++)
96  {
97  if (j == iat)
98  continue;
99  const int ig = grp_ids[j];
100  const REAL* coefs = mw_coefs[ig];
101  REAL DeltaRInv = mw_DeltaRInv[ig];
102  REAL cutoff_radius = mw_cutoff_radius[ig];
103  const int max_index = mw_max_index[ig];
104 
105  REAL r = dist[j];
106  REAL u(0);
107  REAL dudr(0);
108  REAL d2udr2(0);
109  if (r < cutoff_radius)
110  {
111  u = evaluate_impl(r, coefs, DeltaRInv, max_index, dudr, d2udr2);
112  dudr *= REAL(1) / r;
113  }
114  // save u, dudr/r and d2udr2 to cur_allu
115  cur_allu[j] = u;
116  cur_allu[j + n_padded] = dudr;
117  cur_allu[j + n_padded * 2] = d2udr2;
118  val_sum += u;
119  lapl += d2udr2 + (DIM - 1) * dudr;
120  grad_x += dudr * dipl_x[j];
121  grad_y += dudr * dipl_y[j];
122  grad_z += dudr * dipl_z[j];
123  }
124 
125  REAL* vgl = mw_vgl + ip * (DIM + 2);
126  vgl[0] = val_sum;
127  vgl[1] = grad_x;
128  vgl[2] = grad_y;
129  vgl[3] = grad_z;
130  vgl[4] = -lapl;
131  }
132 }
133 
134 template<typename REAL>
135 void BsplineFunctor<REAL>::mw_evaluateV(const int num_groups,
136  const BsplineFunctor<REAL>* const functors[],
137  const int n_src,
138  const int* grp_ids,
139  const int num_pairs,
140  const int* ref_at,
141  const REAL* mw_dist,
142  const int dist_stride,
143  REAL* mw_vals,
144  Vector<char, OffloadPinnedAllocator<char>>& transfer_buffer)
145 {
146  /* transfer buffer used for
147  * Bspline coefs device pointer sizeof(REAL*), DeltaRInv sizeof(REAL), cutoff_radius sizeof(REAL)
148  * these contents change based on the group of the target particle, so it is prepared per call.
149  */
150  transfer_buffer.resize((sizeof(REAL*) + sizeof(REAL) * 2 + sizeof(int)) * num_groups);
151  REAL** mw_coefs_ptr = reinterpret_cast<REAL**>(transfer_buffer.data());
152  REAL* mw_DeltaRInv_ptr = reinterpret_cast<REAL*>(transfer_buffer.data() + sizeof(REAL*) * num_groups);
153  REAL* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups;
154  int* mw_max_index_ptr = reinterpret_cast<int*>(transfer_buffer.data() + (sizeof(REAL*) + sizeof(REAL) * 2) * num_groups);
155  for (int ig = 0; ig < num_groups; ig++)
156  if (functors[ig])
157  {
158  mw_coefs_ptr[ig] = functors[ig]->spline_coefs_->device_data();
159  mw_DeltaRInv_ptr[ig] = functors[ig]->DeltaRInv;
160  mw_cutoff_radius_ptr[ig] = functors[ig]->cutoff_radius;
161  mw_max_index_ptr[ig] = functors[ig]->getMaxIndex();
162  }
163  else
164  {
165  mw_coefs_ptr[ig] = nullptr;
166  mw_DeltaRInv_ptr[ig] = 0.0;
167  mw_cutoff_radius_ptr[ig] = 0.0; // important! Prevent spline evaluation to access nullptr.
168  mw_max_index_ptr[ig] = 0;
169  }
170 
171  auto* transfer_buffer_ptr = transfer_buffer.data();
172 
173  PRAGMA_OFFLOAD("omp target teams distribute map(always, to:transfer_buffer_ptr[:transfer_buffer.size()]) \
174  map(to: grp_ids[:n_src]) \
175  map(to:ref_at[:num_pairs], mw_dist[:dist_stride*num_pairs]) \
176  map(always, from:mw_vals[:num_pairs])")
177  for (int ip = 0; ip < num_pairs; ip++)
178  {
179  REAL sum = 0;
180  const REAL* dist = mw_dist + ip * dist_stride;
181  REAL** mw_coefs = reinterpret_cast<REAL**>(transfer_buffer_ptr);
182  REAL* mw_DeltaRInv = reinterpret_cast<REAL*>(transfer_buffer_ptr + sizeof(REAL*) * num_groups);
183  REAL* mw_cutoff_radius = mw_DeltaRInv + num_groups;
184  int* mw_max_index = reinterpret_cast<int*>(transfer_buffer_ptr + (sizeof(REAL*) + sizeof(REAL) * 2) * num_groups);
185  PRAGMA_OFFLOAD("omp parallel for reduction(+: sum)")
186  for (int j = 0; j < n_src; j++)
187  {
188  const int ig = grp_ids[j];
189  const REAL* coefs = mw_coefs[ig];
190  REAL DeltaRInv = mw_DeltaRInv[ig];
191  REAL cutoff_radius = mw_cutoff_radius[ig];
192  const int max_index = mw_max_index[ig];
193 
194  REAL r = dist[j];
195  if (j != ref_at[ip] && r < cutoff_radius)
196  sum += evaluate_impl(r, coefs, DeltaRInv, max_index);
197  }
198  mw_vals[ip] = sum;
199  }
200 }
201 
202 template<typename REAL>
204  const std::vector<bool>& isAccepted,
205  const int num_groups,
206  const BsplineFunctor* const functors[],
207  const int n_src,
208  const int* grp_ids,
209  const int nw,
210  REAL* mw_vgl, // [nw][DIM+2]
211  const int n_padded,
212  const REAL* mw_dist, // [nw][DIM+1][n_padded]
213  REAL* mw_allUat, // [nw][DIM+2][n_padded]
214  REAL* mw_cur_allu, // [nw][3][n_padded]
215  Vector<char, OffloadPinnedAllocator<char>>& transfer_buffer)
216 {
217  constexpr unsigned DIM = OHMMS_DIM;
218  static_assert(DIM == 3, "only support 3D due to explicit x,y,z coded.");
219  const size_t dist_stride = n_padded * (DIM + 1);
220 
221  /* transfer buffer used for
222  * Bspline coefs device pointer sizeof(REAL*), DeltaRInv sizeof(REAL), cutoff_radius sizeof(REAL)
223  * and packed accept list at most nw * sizeof(int)
224  * these contents change based on the group of the target particle, so it is prepared per call.
225  */
226  transfer_buffer.resize((sizeof(REAL*) + sizeof(REAL) * 2 + sizeof(int)) * num_groups + nw * sizeof(int));
227  REAL** mw_coefs_ptr = reinterpret_cast<REAL**>(transfer_buffer.data());
228  REAL* mw_DeltaRInv_ptr = reinterpret_cast<REAL*>(transfer_buffer.data() + sizeof(REAL*) * num_groups);
229  REAL* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups;
230  int* mw_max_index_ptr = reinterpret_cast<int*>(transfer_buffer.data() + (sizeof(REAL*) + sizeof(REAL) * 2) * num_groups);
231  int* accepted_indices = mw_max_index_ptr + num_groups;
232 
233  for (int ig = 0; ig < num_groups; ig++)
234  if (functors[ig])
235  {
236  mw_coefs_ptr[ig] = functors[ig]->spline_coefs_->device_data();
237  mw_DeltaRInv_ptr[ig] = functors[ig]->DeltaRInv;
238  mw_cutoff_radius_ptr[ig] = functors[ig]->cutoff_radius;
239  mw_max_index_ptr[ig] = functors[ig]->getMaxIndex();
240  }
241  else
242  {
243  mw_coefs_ptr[ig] = nullptr;
244  mw_DeltaRInv_ptr[ig] = 0.0;
245  mw_cutoff_radius_ptr[ig] = 0.0; // important! Prevent spline evaluation to access nullptr.
246  mw_max_index_ptr[ig] = 0;
247  }
248 
249  int nw_accepted = 0;
250  for (int iw = 0; iw < nw; iw++)
251  if (isAccepted[iw])
252  accepted_indices[nw_accepted++] = iw;
253 
254  auto* transfer_buffer_ptr = transfer_buffer.data();
255 
256  PRAGMA_OFFLOAD("omp target teams distribute map(always, to: transfer_buffer_ptr[:transfer_buffer.size()]) \
257  map(to: grp_ids[:n_src]) \
258  map(to: mw_dist[:dist_stride*nw]) \
259  map(to: mw_vgl[:(DIM+2)*nw]) \
260  map(always, from: mw_allUat[:nw * n_padded * (DIM + 2)])")
261  for (int iw = 0; iw < nw_accepted; iw++)
262  {
263  REAL** mw_coefs = reinterpret_cast<REAL**>(transfer_buffer_ptr);
264  REAL* mw_DeltaRInv = reinterpret_cast<REAL*>(transfer_buffer_ptr + sizeof(REAL*) * num_groups);
265  REAL* mw_cutoff_radius = mw_DeltaRInv + num_groups;
266  int* mw_max_index = reinterpret_cast<int*>(transfer_buffer_ptr + (sizeof(REAL*) + sizeof(REAL) * 2) * num_groups);
267  int* accepted_indices = mw_max_index + num_groups;
268  const int ip = accepted_indices[iw];
269 
270  const REAL* dist_new = mw_dist + ip * dist_stride;
271  const REAL* dipl_x_new = dist_new + n_padded;
272  const REAL* dipl_y_new = dist_new + n_padded * 2;
273  const REAL* dipl_z_new = dist_new + n_padded * 3;
274 
275  const REAL* dist_old = mw_dist + ip * dist_stride + dist_stride * nw;
276  const REAL* dipl_x_old = dist_old + n_padded;
277  const REAL* dipl_y_old = dist_old + n_padded * 2;
278  const REAL* dipl_z_old = dist_old + n_padded * 3;
279 
280  REAL* Uat = mw_allUat + ip * n_padded;
281  REAL* dUat_x = mw_allUat + n_padded * nw + ip * n_padded * DIM;
282  REAL* dUat_y = dUat_x + n_padded;
283  REAL* dUat_z = dUat_y + n_padded;
284  REAL* d2Uat = mw_allUat + n_padded * (DIM + 1) * nw + ip * n_padded;
285 
286  REAL* cur_allu = mw_cur_allu + ip * n_padded * 3;
287 
288  PRAGMA_OFFLOAD("omp parallel for")
289  for (int j = 0; j < n_src; j++)
290  {
291  if (j == iat)
292  continue;
293  const int ig = grp_ids[j];
294  const REAL* coefs = mw_coefs[ig];
295  REAL DeltaRInv = mw_DeltaRInv[ig];
296  REAL cutoff_radius = mw_cutoff_radius[ig];
297  const int max_index = mw_max_index[ig];
298 
299  REAL r = dist_old[j];
300  REAL u(0);
301  REAL dudr(0);
302  REAL d2udr2(0);
303  if (r < cutoff_radius)
304  {
305  u = evaluate_impl(dist_old[j], coefs, DeltaRInv, max_index, dudr, d2udr2);
306  dudr *= REAL(1) / r;
307  }
308  // update Uat, dUat, d2Uat
309  REAL cur_u = cur_allu[j];
310  REAL cur_dudr = cur_allu[j + n_padded];
311  REAL cur_d2udr2 = cur_allu[j + n_padded * 2];
312  Uat[j] += cur_u - u;
313  dUat_x[j] -= dipl_x_new[j] * cur_dudr - dipl_x_old[j] * dudr;
314  dUat_y[j] -= dipl_y_new[j] * cur_dudr - dipl_y_old[j] * dudr;
315  dUat_z[j] -= dipl_z_new[j] * cur_dudr - dipl_z_old[j] * dudr;
316  constexpr REAL lapfac(DIM - 1);
317  d2Uat[j] -= cur_d2udr2 + lapfac * cur_dudr - (d2udr2 + lapfac * dudr);
318  }
319  REAL* vgl = mw_vgl + ip * (DIM + 2);
320  Uat[iat] = vgl[0];
321  dUat_x[iat] = vgl[1];
322  dUat_y[iat] = vgl[2];
323  dUat_z[iat] = vgl[3];
324  d2Uat[iat] = vgl[4];
325  }
326 }
327 
329 
330 } // namespace qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
BsplineFunctor class for the Jastrows. REAL is the real type used by the offload target; it is the correct type for the mw data pointers and is also used to coerce/implicitly convert the Real type inherited from OptimizableFunctorBase into that buffer. If offload is off this conversion still happens, but it is just an implementation quirk.
int getMaxIndex() const
return the max allowed beginning index to access spline coefficients
#define OHMMS_DIM
Definition: config.h:64
static void mw_evaluateVGL(const int iat, const int num_groups, const BsplineFunctor *const functors[], const int n_src, const int *grp_ids, const int nw, REAL *mw_vgl, const int n_padded, const REAL *mw_dist, REAL *mw_cur_allu, Vector< char, OffloadPinnedAllocator< char >> &transfer_buffer)
compute value, gradient and laplacian for target particles This more than just a batched call of eval...
static void mw_updateVGL(const int iat, const std::vector< bool > &isAccepted, const int num_groups, const BsplineFunctor *const functors[], const int n_src, const int *grp_ids, const int nw, REAL *mw_vgl, const int n_padded, const REAL *mw_dist, REAL *mw_allUat, REAL *mw_cur_allu, Vector< char, OffloadPinnedAllocator< char >> &transfer_buffer)
update value, gradient and laplacian for target particles. It serves multiple walkers and handles updat...
static void mw_evaluateV(const int num_groups, const BsplineFunctor *const functors[], const int n_src, const int *grp_ids, const int num_pairs, const int *ref_at, const REAL *mw_dist, const int dist_stride, REAL *mw_vals, Vector< char, OffloadPinnedAllocator< char >> &transfer_buffer)
compute value for target-source particle pair potentials This more than just a batched call of evalua...
OMPallocator is an allocator with fused device and dualspace allocator functionality.
std::shared_ptr< Vector< Real, OffloadAllocator< Real > > > spline_coefs_