QMCPACK
CUDAallocator.hpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2019 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 // -*- C++ -*-
12 /** @file CUDAallocator.hpp
13  * this file provides three C++ memory allocators using CUDA specific memory allocation functions.
14  *
15  * CUDAManagedAllocator allocates CUDA unified memory
16  * CUDAAllocator allocates CUDA device memory
17  * CUDAHostAllocator allocates CUDA host pinned memory
18  */
19 #ifndef QMCPLUSPLUS_CUDA_ALLOCATOR_H
20 #define QMCPLUSPLUS_CUDA_ALLOCATOR_H
21 
#include <atomic>
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <type_traits>
#include "CUDAruntime.hpp"
#include "allocator_traits.hpp"
#include "CUDAfill.hpp"
30 
31 namespace qmcplusplus
32 {
/// running total of bytes currently allocated on the device through CUDAAllocator
extern std::atomic<size_t> CUDAallocator_device_mem_allocated;

/// report the current device-memory footprint tracked by CUDAAllocator, in bytes
inline size_t getCUDAdeviceMemAllocated() { return CUDAallocator_device_mem_allocated; }
37 /** allocator for CUDA unified memory
38  * @tparam T data type
39  */
40 template<typename T>
42 {
43  using value_type = T;
44  using size_type = size_t;
45  using pointer = T*;
46  using const_pointer = const T*;
47 
48  CUDAManagedAllocator() = default;
49  template<class U>
51  {}
52 
53  template<class U>
54  struct rebind
55  {
57  };
58 
59  T* allocate(std::size_t n)
60  {
61  void* pt;
62  cudaErrorCheck(cudaMallocManaged(&pt, n * sizeof(T)), "Allocation failed in CUDAManagedAllocator!");
63  if ((size_t(pt)) & (QMC_SIMD_ALIGNMENT - 1))
64  throw std::runtime_error("Unaligned memory allocated in CUDAManagedAllocator");
65  return static_cast<T*>(pt);
66  }
67  void deallocate(T* p, std::size_t) { cudaErrorCheck(cudaFree(p), "Deallocation failed in CUDAManagedAllocator!"); }
68 };
69 
70 template<class T1, class T2>
72 {
73  return true;
74 }
75 template<class T1, class T2>
77 {
78  return false;
79 }
80 
81 
82 /** allocator for CUDA device memory
83  * @tparam T data type
84  *
85  * using this with something other than Ohmms containers?
86  * -- use caution, write unit tests! --
87  * It's not tested beyond use in some unit tests using std::vector with constant size.
88  * CUDAAllocator appears to meet all the nonoptional requirements of a c++ Allocator.
89  *
90  * Some of the default implementations in std::allocator_traits
91  * of optional Allocator requirements may cause runtime or compilation failures.
92  * They assume there is only one memory space and that the host has access to it.
93  */
94 template<typename T>
96 {
97 public:
98  using value_type = T;
99  using size_type = size_t;
100  using pointer = T*;
101  using const_pointer = const T*;
102 
103  CUDAAllocator() = default;
104  template<class U>
106  {}
107 
108  template<class U>
109  struct rebind
110  {
112  };
113 
114  T* allocate(std::size_t n)
115  {
116  void* pt;
117  cudaErrorCheck(cudaMalloc(&pt, n * sizeof(T)), "Allocation failed in CUDAAllocator!");
118  CUDAallocator_device_mem_allocated += n * sizeof(T);
119  return static_cast<T*>(pt);
120  }
121  void deallocate(T* p, std::size_t n)
122  {
123  cudaErrorCheck(cudaFree(p), "Deallocation failed in CUDAAllocator!");
124  CUDAallocator_device_mem_allocated -= n * sizeof(T);
125  }
126 
127  /** Provide a construct for std::allocator_traits::contruct to call.
128  * Don't do anything on construct, pointer p is on the device!
129  *
130  * For example std::vector calls this to default initialize each element. You'll segfault
131  * if std::allocator_traits::construct tries doing that at p.
132  *
133  * The standard is a bit confusing on this point. Implementing this is an optional requirement
134  * of Allocator from C++11 on, its not slated to be removed.
135  *
136  * Its deprecated for the std::allocator in c++17 and will be removed in c++20. But we are not implementing
137  * std::allocator.
138  *
139  * STL containers only use Allocators through allocator_traits and std::allocator_traits handles the case
140  * where no construct method is present in the Allocator.
141  * But std::allocator_traits will call the Allocators construct method if present.
142  */
143  template<class U, class... Args>
144  static void construct(U* p, Args&&... args)
145  {}
146 
147  /** Give std::allocator_traits something to call.
148  * The default if this isn't present is to call p->~T() which
149  * we can't do on device memory.
150  */
151  template<class U>
152  static void destroy(U* p)
153  {}
154 
155  void copyToDevice(T* device_ptr, T* host_ptr, size_t n)
156  {
157  cudaErrorCheck(cudaMemcpy(device_ptr, host_ptr, sizeof(T) * n, cudaMemcpyHostToDevice),
158  "cudaMemcpy failed in copyToDevice");
159  }
160 
161  void copyFromDevice(T* host_ptr, T* device_ptr, size_t n)
162  {
163  cudaErrorCheck(cudaMemcpy(host_ptr, device_ptr, sizeof(T) * n, cudaMemcpyDeviceToHost),
164  "cudaMemcpy failed in copyFromDevice");
165  }
166 
167  void copyDeviceToDevice(T* to_ptr, size_t n, T* from_ptr)
168  {
169  cudaErrorCheck(cudaMemcpy(to_ptr, from_ptr, sizeof(T) * n, cudaMemcpyDeviceToDevice),
170  "cudaMemcpy failed in copyDeviceToDevice");
171  }
172 };
173 
174 template<class T1, class T2>
176 {
177  return true;
178 }
179 template<class T1, class T2>
181 {
182  return false;
183 }
184 
185 template<typename T>
187 {
188  static const bool is_host_accessible = false;
189  static const bool is_dual_space = false;
190  static void fill_n(T* ptr, size_t n, const T& value) { qmcplusplus::CUDAfill_n(ptr, n, value); }
191 };
192 
193 /** allocator for CUDA host pinned memory
194  * @tparam T data type
195  */
196 template<typename T>
198 {
199  using value_type = T;
200  using size_type = size_t;
201  using pointer = T*;
202  using const_pointer = const T*;
203 
204  CUDAHostAllocator() = default;
205  template<class U>
207  {}
208 
209  template<class U>
210  struct rebind
211  {
213  };
214 
215  T* allocate(std::size_t n)
216  {
217  void* pt;
218  cudaErrorCheck(cudaMallocHost(&pt, n * sizeof(T)), "Allocation failed in CUDAHostAllocator!");
219  return static_cast<T*>(pt);
220  }
221  void deallocate(T* p, std::size_t) { cudaErrorCheck(cudaFreeHost(p), "Deallocation failed in CUDAHostAllocator!"); }
222 };
223 
224 template<class T1, class T2>
226 {
227  return true;
228 }
229 template<class T1, class T2>
231 {
232  return false;
233 }
234 
235 /** allocator locks memory pages allocated by ULPHA
236  * @tparam T data type
237  * @tparam ULPHA host memory allocator using unlocked page
238  *
239  * ULPHA cannot be CUDAHostAllocator
240  */
241 template<typename T, class ULPHA = std::allocator<T>>
242 struct CUDALockedPageAllocator : public ULPHA
243 {
244  using value_type = typename ULPHA::value_type;
245  using size_type = typename ULPHA::size_type;
246  using pointer = typename ULPHA::pointer;
247  using const_pointer = typename ULPHA::const_pointer;
248 
249  CUDALockedPageAllocator() = default;
250  template<class U, class V>
252  {}
253 
254  template<class U, class V>
255  struct rebind
256  {
258  };
259 
260  value_type* allocate(std::size_t n)
261  {
262  static_assert(std::is_same<T, value_type>::value, "CUDALockedPageAllocator and ULPHA data types must agree!");
263  value_type* pt = ULPHA::allocate(n);
265  "cudaHostRegister failed in CUDALockedPageAllocator!");
266  return pt;
267  }
268 
269  void deallocate(value_type* pt, std::size_t n)
270  {
271  cudaErrorCheck(cudaHostUnregister(pt), "cudaHostUnregister failed in CUDALockedPageAllocator!");
272  ULPHA::deallocate(pt, n);
273  }
274 };
275 
276 } // namespace qmcplusplus
277 
278 #endif
#define cudaHostRegister
Definition: cuda2hip.h:126
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
void deallocate(T *p, std::size_t n)
#define cudaMemcpy
Definition: cuda2hip.h:135
handle CUDA/HIP runtime selection.
typename ULPHA::size_type size_type
CUDAAllocator(const CUDAAllocator< U > &)
typename ULPHA::const_pointer const_pointer
bool operator==(const Matrix< T, Alloc > &lhs, const Matrix< T, Alloc > &rhs)
Definition: OhmmsMatrix.h:388
void deallocate(value_type *pt, std::size_t n)
CUDAHostAllocator(const CUDAHostAllocator< U > &)
void deallocate(T *p, std::size_t)
#define cudaHostRegisterDefault
Definition: cuda2hip.h:129
void copyFromDevice(T *host_ptr, T *device_ptr, size_t n)
typename ULPHA::value_type value_type
static void destroy(U *p)
Give std::allocator_traits something to call.
#define cudaMallocHost
Definition: cuda2hip.h:121
void copyDeviceToDevice(T *to_ptr, size_t n, T *from_ptr)
CUDAManagedAllocator(const CUDAManagedAllocator< U > &)
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cudaMemcpyDeviceToHost
Definition: cuda2hip.h:138
#define cudaFree
Definition: cuda2hip.h:99
std::atomic< size_t > CUDAallocator_device_mem_allocated
#define cudaMalloc
Definition: cuda2hip.h:119
static constexpr bool is_host_accessible
#define cudaMallocManaged
Definition: cuda2hip.h:130
#define cudaMemcpyDeviceToDevice
Definition: cuda2hip.h:137
#define cudaMemcpyHostToDevice
Definition: cuda2hip.h:139
static void construct(U *p, Args &&... args)
Provide a construct for std::allocator_traits::contruct to call.
template class analogous to std::allocator_traits.
void deallocate(T *p, std::size_t)
T * allocate(std::size_t n)
void copyToDevice(T *device_ptr, T *host_ptr, size_t n)
#define cudaFreeHost
Definition: cuda2hip.h:100
size_t getCUDAdeviceMemAllocated()
void CUDAfill_n(T *ptr, size_t n, const T &value)
fill device memory with a given value.
Definition: CUDAfill.cpp:20
allocator for CUDA unified memory
#define cudaHostUnregister
Definition: cuda2hip.h:127
value_type * allocate(std::size_t n)
allocator for CUDA host pinned memory
CUDALockedPageAllocator(const CUDALockedPageAllocator< U, V > &)
allocator for CUDA device memory
QMCTraits::FullPrecRealType value_type
bool operator!=(const Matrix< T, Alloc > &lhs, const Matrix< T, Alloc > &rhs)
Definition: OhmmsMatrix.h:403
allocator locks memory pages allocated by ULPHA