QMCPACK
CUDAallocator.hpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2019 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 // -*- C++ -*-
12 /** @file CUDAallocator.hpp
13  * this file provides three C++ memory allocators using CUDA specific memory allocation functions.
14  *
15  * CUDAManagedAllocator allocates CUDA unified memory
16  * CUDAAllocator allocates CUDA device memory
17  * CUDAHostAllocator allocates CUDA host pinned memory
18  */
19 #ifndef QMCPLUSPLUS_CUDA_ALLOCATOR_H
20 #define QMCPLUSPLUS_CUDA_ALLOCATOR_H
21 
#include <atomic>
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <type_traits>
#include "CUDAruntime.hpp"
#include "allocator_traits.hpp"
#include "CUDAfill.hpp"
30 
31 namespace qmcplusplus
32 {
/// running total of bytes currently allocated on the device through CUDAAllocator
extern std::atomic<size_t> CUDAallocator_device_mem_allocated;

/// report the current device-memory footprint tracked by CUDAAllocator, in bytes
inline size_t getCUDAdeviceMemAllocated() { return CUDAallocator_device_mem_allocated; }
37 /** allocator for CUDA unified memory
38  * @tparam T data type
39  */
40 template<typename T>
42 {
43  using value_type = T;
44  using size_type = size_t;
45  using pointer = T*;
46  using const_pointer = const T*;
47 
48  CUDAManagedAllocator() = default;
49  template<class U>
51  {}
52 
53  template<class U>
54  struct rebind
55  {
57  };
58 
59  T* allocate(std::size_t n)
60  {
61  void* pt;
62  cudaErrorCheck(cudaMallocManaged(&pt, n * sizeof(T)), "Allocation failed in CUDAManagedAllocator!");
63  if ((size_t(pt)) & (QMC_SIMD_ALIGNMENT - 1))
64  throw std::runtime_error("Unaligned memory allocated in CUDAManagedAllocator");
65  return static_cast<T*>(pt);
66  }
67  void deallocate(T* p, std::size_t) { cudaErrorCheck(cudaFree(p), "Deallocation failed in CUDAManagedAllocator!"); }
68 };
69 
70 template<class T1, class T2>
72 {
73  return true;
74 }
75 template<class T1, class T2>
77 {
78  return false;
79 }
80 
81 
82 /** allocator for CUDA device memory
83  * @tparam T data type
84  *
85  * using this with something other than Ohmms containers?
86  * -- use caution, write unit tests! --
87  * It's not tested beyond use in some unit tests using std::vector with constant size.
88  * CUDAAllocator appears to meet all the nonoptional requirements of a c++ Allocator.
89  *
90  * Some of the default implementations in std::allocator_traits
91  * of optional Allocator requirements may cause runtime or compilation failures.
92  * They assume there is only one memory space and that the host has access to it.
93  */
94 template<typename T>
96 {
97 public:
98  using value_type = T;
99  using size_type = size_t;
100  using pointer = T*;
101  using const_pointer = const T*;
102 
103  CUDAAllocator() = default;
104  template<class U>
106  {}
107 
108  template<class U>
109  struct rebind
110  {
112  };
113 
114  T* allocate(std::size_t n)
115  {
116  void* pt;
117  cudaErrorCheck(cudaMalloc(&pt, n * sizeof(T)), "Allocation failed in CUDAAllocator!");
118  CUDAallocator_device_mem_allocated += n * sizeof(T);
119  return static_cast<T*>(pt);
120  }
121  void deallocate(T* p, std::size_t n)
122  {
123  cudaErrorCheck(cudaFree(p), "Deallocation failed in CUDAAllocator!");
124  CUDAallocator_device_mem_allocated -= n * sizeof(T);
125  }
126 
127  /** Provide a construct for std::allocator_traits::contruct to call.
128  * Don't do anything on construct, pointer p is on the device!
129  *
130  * For example std::vector calls this to default initialize each element. You'll segfault
131  * if std::allocator_traits::construct tries doing that at p.
132  *
133  * The standard is a bit confusing on this point. Implementing this is an optional requirement
134  * of Allocator from C++11 on, its not slated to be removed.
135  *
136  * Its deprecated for the std::allocator in c++17 and will be removed in c++20. But we are not implementing
137  * std::allocator.
138  *
139  * STL containers only use Allocators through allocator_traits and std::allocator_traits handles the case
140  * where no construct method is present in the Allocator.
141  * But std::allocator_traits will call the Allocators construct method if present.
142  */
143  template<class U, class... Args>
144  static void construct(U* p, Args&&... args)
145  {}
146 
147  /** Give std::allocator_traits something to call.
148  * The default if this isn't present is to call p->~T() which
149  * we can't do on device memory.
150  */
151  template<class U>
152  static void destroy(U* p)
153  {}
154 
155  void copyToDevice(T* device_ptr, T* host_ptr, size_t n)
156  {
157  cudaErrorCheck(cudaMemcpy(device_ptr, host_ptr, sizeof(T) * n, cudaMemcpyHostToDevice),
158  "cudaMemcpy failed in copyToDevice");
159  }
160 
161  void copyFromDevice(T* host_ptr, T* device_ptr, size_t n)
162  {
163  cudaErrorCheck(cudaMemcpy(host_ptr, device_ptr, sizeof(T) * n, cudaMemcpyDeviceToHost),
164  "cudaMemcpy failed in copyFromDevice");
165  }
166 
167  void copyDeviceToDevice(T* to_ptr, size_t n, T* from_ptr)
168  {
169  cudaErrorCheck(cudaMemcpy(to_ptr, from_ptr, sizeof(T) * n, cudaMemcpyDeviceToDevice),
170  "cudaMemcpy failed in copyDeviceToDevice");
171  }
172 };
173 
174 template<class T1, class T2>
176 {
177  return true;
178 }
179 template<class T1, class T2>
181 {
182  return false;
183 }
184 
185 template<typename T>
187 {
188  static const bool is_host_accessible = false;
189  static const bool is_dual_space = false;
190  static void fill_n(T* ptr, size_t n, const T& value) { qmcplusplus::CUDAfill_n(ptr, n, value); }
191 };
192 
193 /** allocator for CUDA host pinned memory
194  * @tparam T data type
195  */
196 template<typename T>
198 {
199  using value_type = T;
200  using size_type = size_t;
201  using pointer = T*;
202  using const_pointer = const T*;
203 
204  CUDAHostAllocator() = default;
205  template<class U>
207  {}
208 
209  template<class U>
210  struct rebind
211  {
213  };
214 
215  T* allocate(std::size_t n)
216  {
217  void* pt;
218  cudaErrorCheck(cudaMallocHost(&pt, n * sizeof(T)), "Allocation failed in CUDAHostAllocator!");
219  return static_cast<T*>(pt);
220  }
221  void deallocate(T* p, std::size_t) { cudaErrorCheck(cudaFreeHost(p), "Deallocation failed in CUDAHostAllocator!"); }
222 };
223 
224 template<class T1, class T2>
226 {
227  return true;
228 }
229 template<class T1, class T2>
231 {
232  return false;
233 }
234 
235 /** allocator locks memory pages allocated by ULPHA
236  * @tparam T data type
237  * @tparam ULPHA host memory allocator using unlocked page
238  *
239  * ULPHA cannot be CUDAHostAllocator
240  */
241 template<typename T, class ULPHA = std::allocator<T>>
242 struct CUDALockedPageAllocator : public ULPHA
243 {
244  using value_type = typename ULPHA::value_type;
245  using size_type = typename ULPHA::size_type;
246  using pointer = typename ULPHA::pointer;
247  using const_pointer = typename ULPHA::const_pointer;
248 
249  CUDALockedPageAllocator() = default;
250  template<class U, class V>
252  {}
253 
254  template<class U, class V>
255  struct rebind
256  {
258  };
259 
260  value_type* allocate(std::size_t n)
261  {
262  static_assert(std::is_same<T, value_type>::value, "CUDALockedPageAllocator and ULPHA data types must agree!");
263  value_type* pt = ULPHA::allocate(n);
265  "cudaHostRegister failed in CUDALockedPageAllocator!");
266  return pt;
267  }
268 
269  void deallocate(value_type* pt, std::size_t n)
270  {
271  cudaErrorCheck(cudaHostUnregister(pt), "cudaHostUnregister failed in CUDALockedPageAllocator!");
272  ULPHA::deallocate(pt, n);
273  }
274 };
275 
276 } // namespace qmcplusplus
277 
278 #endif
#define cudaHostRegister
Definition: cuda2hip.h:126
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
void deallocate(T *p, std::size_t n)
#define cudaMemcpy
Definition: cuda2hip.h:135
handle CUDA/HIP runtime selection.
typename ULPHA::size_type size_type
CUDAAllocator(const CUDAAllocator< U > &)
typename ULPHA::const_pointer const_pointer
bool operator==(const Matrix< T, Alloc > &lhs, const Matrix< T, Alloc > &rhs)
Definition: OhmmsMatrix.h:388
void deallocate(value_type *pt, std::size_t n)
CUDAHostAllocator(const CUDAHostAllocator< U > &)
void deallocate(T *p, std::size_t)
#define cudaHostRegisterDefault
Definition: cuda2hip.h:129
void copyFromDevice(T *host_ptr, T *device_ptr, size_t n)
typename ULPHA::value_type value_type
static void destroy(U *p)
Give std::allocator_traits something to call.
#define cudaMallocHost
Definition: cuda2hip.h:121
void copyDeviceToDevice(T *to_ptr, size_t n, T *from_ptr)
CUDAManagedAllocator(const CUDAManagedAllocator< U > &)
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cudaMemcpyDeviceToHost
Definition: cuda2hip.h:138
#define cudaFree
Definition: cuda2hip.h:99
std::atomic< size_t > CUDAallocator_device_mem_allocated
#define cudaMalloc
Definition: cuda2hip.h:119
static constexpr bool is_host_accessible
#define cudaMallocManaged
Definition: cuda2hip.h:130
#define cudaMemcpyDeviceToDevice
Definition: cuda2hip.h:137
#define cudaMemcpyHostToDevice
Definition: cuda2hip.h:139
static void construct(U *p, Args &&... args)
Provide a construct for std::allocator_traits::contruct to call.
template class analogous to std::allocator_traits.
void deallocate(T *p, std::size_t)
T * allocate(std::size_t n)
void copyToDevice(T *device_ptr, T *host_ptr, size_t n)
#define cudaFreeHost
Definition: cuda2hip.h:100
size_t getCUDAdeviceMemAllocated()
void CUDAfill_n(T *ptr, size_t n, const T &value)
fill device memory with a given value.
Definition: CUDAfill.cpp:20
allocator for CUDA unified memory
#define cudaHostUnregister
Definition: cuda2hip.h:127
value_type * allocate(std::size_t n)
allocator for CUDA host pinned memory
CUDALockedPageAllocator(const CUDALockedPageAllocator< U, V > &)
allocator for CUDA device memory
QMCTraits::FullPrecRealType value_type
bool operator!=(const Matrix< T, Alloc > &lhs, const Matrix< T, Alloc > &rhs)
Definition: OhmmsMatrix.h:403
allocator locks memory pages allocated by ULPHA