QMCPACK
OMPallocator.hpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2021 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 // Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
9 //
10 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
11 //////////////////////////////////////////////////////////////////////////////////////
12 // -*- C++ -*-
13 /** @file
14  */
15 #ifndef QMCPLUSPLUS_OMPTARGET_ALLOCATOR_H
16 #define QMCPLUSPLUS_OMPTARGET_ALLOCATOR_H
17 
18 #include <memory>
19 #include <type_traits>
20 #include <atomic>
21 #include "config.h"
22 #include "allocator_traits.hpp"
23 #if defined(ENABLE_OFFLOAD)
24 #include <omp.h>
25 #endif
26 
27 #if defined(QMC_OFFLOAD_MEM_ASSOCIATED)
28 #include <CUDA/CUDAruntime.hpp>
29 #endif
30 
31 namespace qmcplusplus
32 {
33 extern std::atomic<size_t> OMPallocator_device_mem_allocated;
34 
36 
37 template<typename T>
38 T* getOffloadDevicePtr(T* host_ptr)
39 {
40  T* device_ptr;
41  PRAGMA_OFFLOAD("omp target data use_device_ptr(host_ptr)") { device_ptr = host_ptr; }
42  return device_ptr;
43 }
44 
45 /** OMPallocator is an allocator with fused device and dualspace allocator functionality.
46  * it is mostly c++03 style but is stateful with respect to the bond between the returned pt on the host
47  * and the device_ptr_. While many containers may need a copy only one can own the memory
48  * it returns and it can only service one owner. i.e. only one object should call the allocate
49  * and deallocate methods.
50  *
51  * Note: in the style of openmp portability this class always thinks its dual space even when its not,
52  * this happens through the magic of openmp ignoring target pragmas when target isn't enabled. i.e.
53  * -fopenmp-targets=... isn't passed at compile time.
54  * This makes the code "simpler" and more "portable" since its the same code you would write for
55  * openmp CPU implementation *exploding head* and that is the same implementation + pragmas
56  * as the serial implementation. This definitely isn't true for all QMCPACK code using offload
57  * but it is true for OMPAllocator so we do test it that way.
58  */
59 template<typename T, class HostAllocator = std::allocator<T>>
60 struct OMPallocator : public HostAllocator
61 {
63  using size_type = typename HostAllocator::size_type;
64  using pointer = typename HostAllocator::pointer;
65  using const_pointer = typename HostAllocator::const_pointer;
66 
67  OMPallocator() = default;
68  /** Gives you a OMPallocator with no state.
69  * But OMPallocoator is stateful so this copy constructor is a lie.
70  * However until allocators are correct > c++11 this is retained since
71  * our < c++11 compliant containers may expect it.
72  */
73  OMPallocator(const OMPallocator&) : device_ptr_(nullptr) {}
74  template<class U, class V>
76  {}
77 
78  template<class U, class V>
79  struct rebind
80  {
82  };
83 
84  value_type* allocate(std::size_t n)
85  {
86  static_assert(std::is_same<T, value_type>::value, "OMPallocator and HostAllocator data types must agree!");
87  value_type* pt = HostAllocator::allocate(n);
88 #if defined(QMC_OFFLOAD_MEM_ASSOCIATED)
89  cudaErrorCheck(cudaMalloc(&device_ptr_, n * sizeof(T)), "cudaMalloc failed in OMPallocator!");
90  const int status = omp_target_associate_ptr(pt, device_ptr_, n * sizeof(T), 0, omp_get_default_device());
91  if (status != 0)
92  throw std::runtime_error("omp_target_associate_ptr failed in OMPallocator!");
93 #else
94  PRAGMA_OFFLOAD("omp target enter data map(alloc:pt[0:n])")
96 #endif
97  OMPallocator_device_mem_allocated += n * sizeof(T);
98  return pt;
99  }
100 
101  void deallocate(value_type* pt, std::size_t n)
102  {
103  OMPallocator_device_mem_allocated -= n * sizeof(T);
104 #if defined(QMC_OFFLOAD_MEM_ASSOCIATED)
105  T* device_ptr_from_omp = getOffloadDevicePtr(pt);
106  const int status = omp_target_disassociate_ptr(pt, omp_get_default_device());
107  if (status != 0)
108  throw std::runtime_error("omp_target_disassociate_ptr failed in OMPallocator!");
109  cudaErrorCheck(cudaFree(device_ptr_from_omp), "cudaFree failed in OMPallocator!");
110 #else
111  PRAGMA_OFFLOAD("omp target exit data map(delete:pt[0:n])")
112 #endif
113  HostAllocator::deallocate(pt, n);
114  }
115 
116  void attachReference(const OMPallocator& from, std::ptrdiff_t ptr_offset)
117  {
118  device_ptr_ = const_cast<typename OMPallocator::pointer>(from.get_device_ptr()) + ptr_offset;
119  }
120 
121  T* get_device_ptr() { return device_ptr_; }
122  const T* get_device_ptr() const { return device_ptr_; }
123 
124 private:
125  // pointee is on device.
126  T* device_ptr_ = nullptr;
127 };
128 
129 /** Specialization for OMPallocator which is a special DualAllocator with fused
130  * device and dualspace allocator functionality.
131  */
132 template<typename T, class HostAllocator>
133 struct qmc_allocator_traits<OMPallocator<T, HostAllocator>>
134 {
135  static constexpr bool is_host_accessible = true;
136  static constexpr bool is_dual_space = true;
137 
138  static void fill_n(T* ptr, size_t n, const T& value)
139  {
141  //PRAGMA_OFFLOAD("omp target update to(ptr[:n])")
142  }
143 
146  std::ptrdiff_t ptr_offset)
147  {
148  to.attachReference(from, ptr_offset);
149  }
150 
151  static void updateTo(OMPallocator<T, HostAllocator>& alloc, T* host_ptr, size_t n, size_t offset = 0)
152  {
153  PRAGMA_OFFLOAD("omp target update to(host_ptr[offset:n])");
154  }
155 
156  static void updateFrom(OMPallocator<T, HostAllocator>& alloc, T* host_ptr, size_t n, size_t offset = 0)
157  {
158  PRAGMA_OFFLOAD("omp target update from(host_ptr[offset:n])");
159  }
160 
161  // Not very optimized device side copy. Only used for testing.
162  static void deviceSideCopyN(OMPallocator<T, HostAllocator>& alloc, size_t to, size_t n, size_t from)
163  {
164  auto* dev_ptr = alloc.get_device_ptr();
165  PRAGMA_OFFLOAD("omp target teams distribute parallel for is_device_ptr(dev_ptr)")
166  for (int i = 0; i < n; i++)
167  dev_ptr[to + i] = dev_ptr[from + i];
168  }
169 };
170 
171 #if defined(ENABLE_OFFLOAD)
172 /** allocator for OMPTarget device memory
173  * @tparam T data type
174  *
175  * using this with something other than Ohmms containers?
176  * -- use caution, write unit tests! --
177  * It's not tested beyond use in some unit tests using std::vector with constant size.
178  * OMPTargetAllocator appears to meet all the nonoptional requirements of a c++ Allocator.
179  *
180  * Some of the default implementations in std::allocator_traits
181  * of optional Allocator requirements may cause runtime or compilation failures.
182  * They assume there is only one memory space and that the host has access to it.
183  */
184 template<typename T>
185 class OMPTargetAllocator
186 {
187 public:
188  using value_type = T;
189  using size_type = size_t;
190  using pointer = T*;
191  using const_pointer = const T*;
192 
193  OMPTargetAllocator() = default;
194  template<class U>
195  OMPTargetAllocator(const OMPTargetAllocator<U>&)
196  {}
197 
198  template<class U>
199  struct rebind
200  {
201  using other = OMPTargetAllocator<U>;
202  };
203 
204  T* allocate(std::size_t n)
205  {
206  void* pt = omp_target_alloc(n * sizeof(T), omp_get_default_device());
207  if (!pt)
208  throw std::runtime_error("Allocation failed in OMPTargetAllocator!");
209  OMPallocator_device_mem_allocated += n * sizeof(T);
210  return static_cast<T*>(pt);
211  }
212 
213  void deallocate(T* p, std::size_t n)
214  {
215  omp_target_free(p, omp_get_default_device());
216  OMPallocator_device_mem_allocated -= n * sizeof(T);
217  }
218 
219  /** Provide a construct for std::allocator_traits::contruct to call.
220  * Don't do anything on construct, pointer p is on the device!
221  *
222  * For example std::vector calls this to default initialize each element. You'll segfault
223  * if std::allocator_traits::construct tries doing that at p.
224  *
225  * The standard is a bit confusing on this point. Implementing this is an optional requirement
226  * of Allocator from C++11 on, its not slated to be removed.
227  *
228  * Its deprecated for the std::allocator in c++17 and will be removed in c++20. But we are not implementing
229  * std::allocator.
230  *
231  * STL containers only use Allocators through allocator_traits and std::allocator_traits handles the case
232  * where no construct method is present in the Allocator.
233  * But std::allocator_traits will call the Allocators construct method if present.
234  */
235  template<class U, class... Args>
236  static void construct(U* p, Args&&... args)
237  {}
238 
239  /** Give std::allocator_traits something to call.
240  * The default if this isn't present is to call p->~T() which
241  * we can't do on device memory.
242  */
243  template<class U>
244  static void destroy(U* p)
245  {}
246 
247  void copyToDevice(T* device_ptr, T* host_ptr, size_t n)
248  {
249  const auto host_id = omp_get_initial_device();
250  if (omp_target_memcpy(device_ptr, host_ptr, n, 0, 0, omp_get_default_device(), host_id))
251  throw std::runtime_error("omp_target_memcpy failed in copyToDevice");
252  }
253 
254  void copyFromDevice(T* host_ptr, T* device_ptr, size_t n)
255  {
256  const auto host_id = omp_get_initial_device();
257  if (omp_target_memcpy(host_ptr, device_ptr, n, 0, 0, host_id, omp_get_default_device()))
258  throw std::runtime_error("omp_target_memcpy failed in copyToDevice");
259  }
260 
261  void copyDeviceToDevice(T* to_ptr, size_t n, T* from_ptr)
262  {
263  if (omp_target_memcpy(to_ptr, from_ptr, n, 0, 0, omp_get_default_device(), omp_get_default_device()))
264  throw std::runtime_error("omp_target_memcpy failed in copyToDevice");
265  }
266 };
267 
268 template<class T1, class T2>
269 bool operator==(const OMPTargetAllocator<T1>&, const OMPTargetAllocator<T2>&)
270 {
271  return true;
272 }
273 template<class T1, class T2>
274 bool operator!=(const OMPTargetAllocator<T1>&, const OMPTargetAllocator<T2>&)
275 {
276  return false;
277 }
278 
279 template<typename T>
280 struct qmc_allocator_traits<qmcplusplus::OMPTargetAllocator<T>>
281 {
282  static const bool is_host_accessible = false;
283  static const bool is_dual_space = false;
284  static void fill_n(T* ptr, size_t n, const T& value) {}
285 };
286 #endif
287 
288 } // namespace qmcplusplus
289 #endif
typename std::allocator< Value > ::const_pointer const_pointer
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
void deallocate(value_type *pt, std::size_t n)
void attachReference(const OMPallocator &from, std::ptrdiff_t ptr_offset)
handle CUDA/HIP runtime selection.
size_t getOMPdeviceMemAllocated()
bool operator==(const Matrix< T, Alloc > &lhs, const Matrix< T, Alloc > &rhs)
Definition: OhmmsMatrix.h:388
static void deviceSideCopyN(OMPallocator< T, HostAllocator > &alloc, size_t to, size_t n, size_t from)
const T * get_device_ptr() const
typename std::allocator< Value > ::size_type size_type
T * getOffloadDevicePtr(T *host_ptr)
static void updateTo(OMPallocator< T, HostAllocator > &alloc, T *host_ptr, size_t n, size_t offset=0)
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
OMPallocator(const OMPallocator &)
Gives you a OMPallocator with no state.
#define cudaFree
Definition: cuda2hip.h:99
#define cudaMalloc
Definition: cuda2hip.h:119
static constexpr bool is_host_accessible
typename std::allocator< Value > ::value_type value_type
OMPallocator is an allocator with fused device and dualspace allocator functionality.
template class analogous to std::allocator_traits.
OMPallocator(const OMPallocator< U, V > &)
static void attachReference(const OMPallocator< T, HostAllocator > &from, OMPallocator< T, HostAllocator > &to, std::ptrdiff_t ptr_offset)
std::atomic< size_t > OMPallocator_device_mem_allocated
typename std::allocator< Value > ::pointer pointer
value_type * allocate(std::size_t n)
static void fill_n(value_type *ptr, size_t n, const value_type &value)
QMCTraits::FullPrecRealType value_type
static void updateFrom(OMPallocator< T, HostAllocator > &alloc, T *host_ptr, size_t n, size_t offset=0)
bool operator!=(const Matrix< T, Alloc > &lhs, const Matrix< T, Alloc > &rhs)
Definition: OhmmsMatrix.h:403