QMCPACK
SoaDistanceTableABOMPTarget.h
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2021 QMCPACK developers.
6 //
7 // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
8 // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp.
9 // Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //
11 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
12 //////////////////////////////////////////////////////////////////////////////////////
13 // -*- C++ -*-
14 #ifndef QMCPLUSPLUS_DTDIMPL_AB_OMPTARGET_H
15 #define QMCPLUSPLUS_DTDIMPL_AB_OMPTARGET_H
16 
18 #include "DistanceTable.h"
22 #include "ResourceCollection.h"
24 
25 namespace qmcplusplus
26 {
27 /**@ingroup nnlist
28  * @brief A derived classe from DistacneTableData, specialized for AB using a transposed form
29  */
30 template<typename T, unsigned D, int SC>
31 class SoaDistanceTableABOMPTarget : public DTD_BConds<T, D, SC>, public DistanceTableAB
32 {
33 private:
34  template<typename DT>
36 
37  ///accelerator output buffer for r and dr
39  ///accelerator input array for a list of target particle positions, num_targets_ x D
41 
42  ///multi walker shared memory buffer
43  struct DTABMultiWalkerMem : public Resource
44  {
45  ///accelerator output array for multiple walkers, [1+D][num_targets_][num_padded] (distances, displacements)
47  ///accelerator input buffer for multiple data set
49 
50  DTABMultiWalkerMem() : Resource("DTABMultiWalkerMem") {}
51 
53 
54  std::unique_ptr<Resource> makeClone() const override { return std::make_unique<DTABMultiWalkerMem>(*this); }
55  };
56 
58 
59  void resize()
60  {
61  if (num_sources_ * num_targets_ == 0)
62  return;
63  if (distances_.size())
64  return;
65 
66  // initialize memory containers and views
67  const size_t num_padded = getAlignedSize<T>(num_sources_);
68  const size_t stride_size = getPerTargetPctlStrideSize();
69  r_dr_memorypool_.resize(stride_size * num_targets_);
70 
71  distances_.resize(num_targets_);
73  for (int i = 0; i < num_targets_; ++i)
74  {
75  distances_[i].attachReference(r_dr_memorypool_.data() + i * stride_size, num_sources_);
76  displacements_[i].attachReference(num_sources_, num_padded,
77  r_dr_memorypool_.data() + i * stride_size + num_padded);
78  }
79  }
80 
82  {
83  auto& dt_leader = dt_list.getCastedLeader<SoaDistanceTableABOMPTarget>();
84 
85  // initialize memory containers and views
86  size_t count_targets = 0;
87  for (size_t iw = 0; iw < dt_list.size(); iw++)
88  {
89  auto& dt = dt_list.getCastedElement<SoaDistanceTableABOMPTarget>(iw);
90  count_targets += dt.targets();
91  dt.r_dr_memorypool_.free();
92  }
93 
94  const size_t num_sources = dt_leader.num_sources_;
95  const size_t num_padded = getAlignedSize<T>(dt_leader.num_sources_);
96  const size_t stride_size = num_padded * (D + 1);
97  const size_t total_targets = count_targets;
98  auto& mw_r_dr = dt_leader.mw_mem_handle_.getResource().mw_r_dr;
99  mw_r_dr.resize(total_targets * stride_size);
100 
101  count_targets = 0;
102  for (size_t iw = 0; iw < dt_list.size(); iw++)
103  {
104  auto& dt = dt_list.getCastedElement<SoaDistanceTableABOMPTarget>(iw);
105  assert(num_sources == dt.num_sources_);
106 
107  dt.distances_.resize(dt.targets());
108  dt.displacements_.resize(dt.targets());
109 
110  for (int i = 0; i < dt.targets(); ++i)
111  {
112  dt.distances_[i].attachReference(mw_r_dr.data() + (i + count_targets) * stride_size, num_sources);
113  dt.displacements_[i].attachReference(num_sources, num_padded,
114  mw_r_dr.data() + (i + count_targets) * stride_size + num_padded);
115  }
116  count_targets += dt.targets();
117  }
118  }
119 
120 public:
122  : DTD_BConds<T, D, SC>(source.getLattice()),
123  DistanceTableAB(source, target, DTModes::ALL_OFF),
124  offload_timer_(createGlobalTimer(std::string("DTABOMPTarget::offload_") + name_, timer_level_fine)),
125  evaluate_timer_(createGlobalTimer(std::string("DTABOMPTarget::evaluate_") + name_, timer_level_fine)),
126  move_timer_(createGlobalTimer(std::string("DTABOMPTarget::move_") + name_, timer_level_fine)),
127  update_timer_(createGlobalTimer(std::string("DTABOMPTarget::update_") + name_, timer_level_fine))
128 
129  {
130  auto* coordinates_soa = dynamic_cast<const RealSpacePositionsOMPTarget*>(&source.getCoordinates());
131  if (!coordinates_soa)
132  throw std::runtime_error("Source particle set doesn't have OpenMP offload. Contact developers!");
133  PRAGMA_OFFLOAD("omp target enter data map(to : this[:1])")
134 
135  // The padding of temp_r_ and temp_dr_ is necessary for the memory copy in the update function
136  // temp_r_ is padded explicitly while temp_dr_ is padded internally
137  const int num_padded = getAlignedSize<T>(num_sources_);
138  temp_r_.resize(num_padded);
140  }
141 
142  SoaDistanceTableABOMPTarget() = delete;
144 
145  ~SoaDistanceTableABOMPTarget() { PRAGMA_OFFLOAD("omp target exit data map(delete : this[:1])") }
146 
147  void createResource(ResourceCollection& collection) const override
148  {
149  auto resource_index = collection.addResource(std::make_unique<DTABMultiWalkerMem>());
150  }
151 
152  void acquireResource(ResourceCollection& collection, const RefVectorWithLeader<DistanceTable>& dt_list) const override
153  {
154  auto& dt_leader = dt_list.getCastedLeader<SoaDistanceTableABOMPTarget>();
155  dt_leader.mw_mem_handle_ = collection.lendResource<DTABMultiWalkerMem>();
156  associateResource(dt_list);
157  }
158 
159  void releaseResource(ResourceCollection& collection, const RefVectorWithLeader<DistanceTable>& dt_list) const override
160  {
162  for (size_t iw = 0; iw < dt_list.size(); iw++)
163  {
164  auto& dt = dt_list.getCastedElement<SoaDistanceTableABOMPTarget>(iw);
165  dt.distances_.clear();
166  dt.displacements_.clear();
167  }
168  }
169 
170  const T* getMultiWalkerDataPtr() const override { return mw_mem_handle_.getResource().mw_r_dr.data(); }
171 
172  size_t getPerTargetPctlStrideSize() const override { return getAlignedSize<T>(num_sources_) * (D + 1); }
173 
174  /** evaluate the full table */
175  inline void evaluate(ParticleSet& P) override
176  {
177  resize();
178 
179  ScopedTimer local_timer(evaluate_timer_);
180  // be aware of the sign of Displacement
181  const int num_targets_local = num_targets_;
182  const int num_sources_local = num_sources_;
183  const int num_padded = getAlignedSize<T>(num_sources_);
184 
185  target_pos.resize(num_targets_ * D);
186  for (size_t iat = 0; iat < num_targets_; iat++)
187  for (size_t idim = 0; idim < D; idim++)
188  target_pos[iat * D + idim] = P.R[iat][idim];
189 
190  auto* target_pos_ptr = target_pos.data();
191  auto* source_pos_ptr = origin_.getCoordinates().getAllParticlePos().data();
192  auto* r_dr_ptr = distances_[0].data();
193  assert(distances_[0].data() + num_padded == displacements_[0].data());
194 
195  // To maximize thread usage, the loop over electrons is chunked. Each chunk is sent to an OpenMP offload thread team.
196  const int ChunkSizePerTeam = 512;
197  const size_t num_teams = (num_sources_ + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
198  const size_t stride_size = getPerTargetPctlStrideSize();
199 
200  {
201  ScopedTimer offload(offload_timer_);
202  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(num_targets_*num_teams) \
203  map(to: source_pos_ptr[:num_padded*D]) \
204  map(always, to: target_pos_ptr[:num_targets_*D]) \
205  map(always, from: r_dr_ptr[:num_targets_*stride_size])")
206  for (int iat = 0; iat < num_targets_local; ++iat)
207  for (int team_id = 0; team_id < num_teams; team_id++)
208  {
209  const int first = ChunkSizePerTeam * team_id;
210  const int last = omptarget::min(first + ChunkSizePerTeam, num_sources_local);
211 
212  T pos[D];
213  for (int idim = 0; idim < D; idim++)
214  pos[idim] = target_pos_ptr[iat * D + idim];
215 
216  auto* r_iat_ptr = r_dr_ptr + iat * stride_size;
217  auto* dr_iat_ptr = r_iat_ptr + num_padded;
218 
219  PRAGMA_OFFLOAD("omp parallel for")
220  for (int iel = first; iel < last; iel++)
221  DTD_BConds<T, D, SC>::computeDistancesOffload(pos, source_pos_ptr, num_padded, r_iat_ptr, dr_iat_ptr,
222  num_padded, iel);
223  }
224  }
225  }
226 
228  const RefVectorWithLeader<ParticleSet>& p_list) const override
229  {
230  assert(this == &dt_list.getLeader());
231  auto& dt_leader = dt_list.getCastedLeader<SoaDistanceTableABOMPTarget>();
232 
233  ScopedTimer local_timer(evaluate_timer_);
234 
235  const size_t nw = dt_list.size();
236  DTABMultiWalkerMem& mw_mem = dt_leader.mw_mem_handle_;
237  auto& mw_r_dr = mw_mem.mw_r_dr;
238 
239  size_t count_targets = 0;
240  for (ParticleSet& p : p_list)
241  count_targets += p.getTotalNum();
242  const size_t total_targets = count_targets;
243 
244  const int num_padded = getAlignedSize<T>(num_sources_);
245 
246 #ifndef NDEBUG
247  const int stride_size = getPerTargetPctlStrideSize();
248  count_targets = 0;
249  for (size_t iw = 0; iw < dt_list.size(); iw++)
250  {
251  auto& dt = dt_list.getCastedElement<SoaDistanceTableABOMPTarget>(iw);
252 
253  for (int i = 0; i < dt.targets(); ++i)
254  {
255  assert(dt.distances_[i].data() == mw_r_dr.data() + (i + count_targets) * stride_size);
256  assert(dt.displacements_[i].data() == mw_r_dr.data() + (i + count_targets) * stride_size + num_padded);
257  }
258  count_targets += dt.targets();
259  }
260 #endif
261 
262  // This is horrible optimization putting different data types in a single buffer but allows a single H2D transfer
263  const size_t realtype_size = sizeof(RealType);
264  const size_t int_size = sizeof(int);
265  const size_t ptr_size = sizeof(RealType*);
266  auto& offload_input = mw_mem.offload_input;
267  offload_input.resize(total_targets * D * realtype_size + total_targets * int_size + nw * ptr_size);
268  auto source_ptrs = reinterpret_cast<RealType**>(offload_input.data());
269  auto target_positions = reinterpret_cast<RealType*>(offload_input.data() + ptr_size * nw);
270  auto walker_id_ptr =
271  reinterpret_cast<int*>(offload_input.data() + ptr_size * nw + total_targets * D * realtype_size);
272 
273  count_targets = 0;
274  for (size_t iw = 0; iw < nw; iw++)
275  {
276  auto& dt = dt_list.getCastedElement<SoaDistanceTableABOMPTarget>(iw);
277  ParticleSet& pset(p_list[iw]);
278 
279  assert(dt.targets() == pset.getTotalNum());
280  assert(num_sources_ == dt.num_sources_);
281 
282  auto& RSoA_OMPTarget = static_cast<const RealSpacePositionsOMPTarget&>(dt.origin_.getCoordinates());
283  source_ptrs[iw] = const_cast<RealType*>(RSoA_OMPTarget.getDevicePtr());
284 
285  for (size_t iat = 0; iat < pset.getTotalNum(); ++iat, ++count_targets)
286  {
287  walker_id_ptr[count_targets] = iw;
288  for (size_t idim = 0; idim < D; idim++)
289  target_positions[count_targets * D + idim] = pset.R[iat][idim];
290  }
291  }
292 
293  // To maximize thread usage, the loop over electrons is chunked. Each chunk is sent to an OpenMP offload thread team.
294  const int ChunkSizePerTeam = 512;
295  const size_t num_teams = (num_sources_ + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
296 
297  auto* r_dr_ptr = mw_r_dr.data();
298  auto* input_ptr = offload_input.data();
299  const int num_sources_local = num_sources_;
300 
301  {
302  ScopedTimer offload(dt_leader.offload_timer_);
303  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(total_targets*num_teams) \
304  map(always, to: input_ptr[:offload_input.size()]) \
305  depend(out:r_dr_ptr[:mw_r_dr.size()])")
306  for (int iat = 0; iat < total_targets; ++iat)
307  for (int team_id = 0; team_id < num_teams; team_id++)
308  {
309  auto* target_pos_ptr = reinterpret_cast<RealType*>(input_ptr + ptr_size * nw);
310  const int walker_id =
311  reinterpret_cast<int*>(input_ptr + ptr_size * nw + total_targets * D * realtype_size)[iat];
312  auto* source_pos_ptr = reinterpret_cast<RealType**>(input_ptr)[walker_id];
313  auto* r_iat_ptr = r_dr_ptr + iat * num_padded * (D + 1);
314  auto* dr_iat_ptr = r_dr_ptr + iat * num_padded * (D + 1) + num_padded;
315 
316  const int first = ChunkSizePerTeam * team_id;
317  const int last = omptarget::min(first + ChunkSizePerTeam, num_sources_local);
318 
319  T pos[D];
320  for (int idim = 0; idim < D; idim++)
321  pos[idim] = target_pos_ptr[iat * D + idim];
322 
323  PRAGMA_OFFLOAD("omp parallel for")
324  for (int iel = first; iel < last; iel++)
325  DTD_BConds<T, D, SC>::computeDistancesOffload(pos, source_pos_ptr, num_padded, r_iat_ptr, dr_iat_ptr,
326  num_padded, iel);
327  }
328 
330  {
331  PRAGMA_OFFLOAD(
332  "omp target update from(r_dr_ptr[:mw_r_dr.size()]) depend(inout:r_dr_ptr[:mw_r_dr.size()]) nowait")
333  }
334  // wait for computing and (optional) transferring back to host.
335  // It can potentially be moved to ParticleSet to fuse multiple similar taskwait
336  PRAGMA_OFFLOAD("omp taskwait")
337  }
338  }
339 
341  const RefVectorWithLeader<ParticleSet>& p_list,
342  const std::vector<bool>& recompute) const override
343  {
344  mw_evaluate(dt_list, p_list);
345  }
346 
347  ///evaluate the temporary pair relations
348  inline void move(const ParticleSet& P, const PosType& rnew, const IndexType iat, bool prepare_old) override
349  {
350  ScopedTimer local_timer(move_timer_);
352  0, num_sources_);
353  // If the full table is not ready all the time, overwrite the current value.
354  // If this step is missing, DT values can be undefined in case a move is rejected.
355  if (!(modes_ & DTModes::NEED_FULL_TABLE_ANYTIME) && prepare_old)
357  distances_[iat].data(), displacements_[iat], 0, num_sources_);
358  }
359 
360  ///update the stripe for jat-th particle
361  inline void update(IndexType iat) override
362  {
363  ScopedTimer local_timer(update_timer_);
365  for (int idim = 0; idim < D; ++idim)
366  std::copy_n(temp_dr_.data(idim), num_sources_, displacements_[iat].data(idim));
367  }
368 
369  int get_first_neighbor(IndexType iat, RealType& r, PosType& dr, bool newpos) const override
370  {
371  RealType min_dist = std::numeric_limits<RealType>::max();
372  int index = -1;
373  if (newpos)
374  {
375  for (int jat = 0; jat < num_sources_; ++jat)
376  if (temp_r_[jat] < min_dist)
377  {
378  min_dist = temp_r_[jat];
379  index = jat;
380  }
381  if (index >= 0)
382  {
383  r = min_dist;
384  dr = temp_dr_[index];
385  }
386  }
387  else
388  {
389  for (int jat = 0; jat < num_sources_; ++jat)
390  if (distances_[iat][jat] < min_dist)
391  {
392  min_dist = distances_[iat][jat];
393  index = jat;
394  }
395  if (index >= 0)
396  {
397  r = min_dist;
398  dr = displacements_[iat][index];
399  }
400  }
401  assert(index >= 0 && index < num_sources_);
402  return index;
403  }
404 
405 private:
406  /// timer for offload portion
408  /// timer for evaluate()
410  /// timer for move()
412  /// timer for update()
414 };
415 } // namespace qmcplusplus
416 #endif
void resize(size_type n, Type_t val=Type_t())
Resize the container.
Definition: OhmmsVector.h:166
static void associateResource(const RefVectorWithLeader< DistanceTable > &dt_list)
QMCTraits::RealType RealType
Definition: DistanceTable.h:44
size_t addResource(std::unique_ptr< Resource > &&res, bool noprint=false)
NewTimer & offload_timer_
timer for offload portion
void takebackResource(ResourceHandle< RS > &res_handle)
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
void evaluate(ParticleSet &P) override
evaluate the full table
virtual const PosVectorSoa & getAllParticlePos() const =0
all particle position accessor
ResourceHandle manages the temporary resource referenced from a collection.
OffloadPinnedVector< T > mw_r_dr
accelerator output array for multiple walkers, [1+D][num_targets_][num_padded] (distances, displacements)
void releaseResource(ResourceCollection &collection, const RefVectorWithLeader< DistanceTable > &dt_list) const override
return a shared resource to a collection
void createResource(ResourceCollection &collection) const override
initialize a shared resource and hand it to a collection
Timer accumulates time and call counts.
Definition: NewTimer.h:135
OffloadPinnedVector< RealType > r_dr_memorypool_
accelerator output buffer for r and dr
void acquireResource(ResourceCollection &collection, const RefVectorWithLeader< DistanceTable > &dt_list) const override
acquire a shared resource from a collection
void update(IndexType iat) override
update the stripe for jat-th particle
const std::string name_
name of the table
Definition: DistanceTable.h:57
T min(T a, T b)
Specialized paritlce class for atomistic simulations.
Definition: ParticleSet.h:55
const T * getMultiWalkerDataPtr() const override
return multi-walker full (all pairs) distance table data pointer
int get_first_neighbor(IndexType iat, RealType &r, PosType &dr, bool newpos) const override
AB type of DistanceTable containing storage.
size_t targets() const
returns the number of centers
Definition: DistanceTable.h:94
Introduced to handle virtual moves and ratio computations, e.g.
CASTTYPE & getCastedElement(size_t i) const
NewTimer & createGlobalTimer(const std::string &myname, timer_levels mylevel)
ParticlePos R
Position.
Definition: ParticleSet.h:79
const DynamicCoordinates & getCoordinates() const
Definition: ParticleSet.h:246
whether full table needs to be ready at anytime or not during PbyP Optimization can be implemented du...
size_t getPerTargetPctlStrideSize() const override
return stride of per target pctl data. full table data = stride * num of target particles ...
QMCTraits::IndexType IndexType
Definition: DistanceTable.h:43
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
A derived classe from DistacneTableData, specialized for AB using a transposed form.
ResourceHandle< DTABMultiWalkerMem > mw_mem_handle_
OffloadPinnedVector< T > target_pos
accelerator input array for a list of target particle positions, num_targets_ x D ...
void mw_recompute(const RefVectorWithLeader< DistanceTable > &dt_list, const RefVectorWithLeader< ParticleSet > &p_list, const std::vector< bool > &recompute) const override
recompute multi walker internal data, recompute
SoaDistanceTableABOMPTarget(const ParticleSet &source, ParticleSet &target)
handle math function mapping inside OpenMP offload regions.
ResourceHandle< RS > lendResource()
std::vector< DisplRow > displacements_
displacements_[num_targets_][3][num_sources_], [i][3][j] = r_A2[j] - r_A1[i] Note: Derived classes de...
OffloadPinnedVector< char > offload_input
accelerator input buffer for multiple data set
void resize(size_type n)
resize myData
const ParticleSet & origin_
Definition: DistanceTable.h:51
void mw_evaluate(const RefVectorWithLeader< DistanceTable > &dt_list, const RefVectorWithLeader< ParticleSet > &p_list) const override
void move(const ParticleSet &P, const PosType &rnew, const IndexType iat, bool prepare_old) override
evaluate the temporary pair relations
std::vector< DistRow > distances_
distances_[num_targets_][num_sources_], [i][3][j] = |r_A2[j] - r_A1[i]| Note: Derived classes decide ...
DTModes modes_
operation modes defined by DTModes
Definition: DistanceTable.h:60
skip data transfer back to host after mw_evalaute full distance table.