QMCPACK
DelayedUpdateCUDA.h
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2019 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H
#define QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H

#include "OhmmsPETE/OhmmsVector.h"
#include "OhmmsPETE/OhmmsMatrix.h"
#include "CUDA/CUDAruntime.hpp"
#include "CUDA/CUDAallocator.hpp"
#include "CUDA/AccelBLAS_CUDA.hpp"
#include "QMCWaveFunctions/detail/CUDA/delayed_update_helper.h"
#include "PrefetchedRange.h"
#if defined(QMC_CUDA2HIP)
#include "rocSolverInverter.hpp"
#else
#include "cuSolverInverter.hpp"
#endif

namespace qmcplusplus
{
/** implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN
 * @tparam T base precision for most computation
 * @tparam T_FP high precision for matrix inversion, T_FP >= T
 */
template<typename T, typename T_FP>
class DelayedUpdateCUDA
{
  // Data staged during delayed acceptRows
  Matrix<T, CUDAHostAllocator<T>> U;
  Matrix<T, CUDAHostAllocator<T>> Binv;
  Matrix<T> V;
  //Matrix<T> tempMat; // for debugging only
  Matrix<T, CUDAAllocator<T>> temp_gpu;
  /// GPU copy of U, V, Binv, Ainv
  Matrix<T, CUDAAllocator<T>> U_gpu;
  Matrix<T, CUDAAllocator<T>> V_gpu;
  Matrix<T, CUDAAllocator<T>> Binv_gpu;
  Matrix<T, CUDAAllocator<T>> Ainv_gpu;
  // auxiliary arrays for B
  Vector<T> p;
  Vector<int, CUDAHostAllocator<int>> delay_list;
  Vector<int, CUDAAllocator<int>> delay_list_gpu;
  /// current number of delays, increased by one for each acceptance, reset to 0 after updating Ainv
  int delay_count;

#if defined(QMC_CUDA2HIP)
  rocSolverInverter<T_FP> rocsolver_inverter;
#else
  cuSolverInverter<T_FP> cusolver_inverter;
#endif

  // the range of prefetched_Ainv_rows
  PrefetchedRange prefetched_range;
  // Ainv prefetch buffer
  Matrix<T, CUDAHostAllocator<T>> Ainv_buffer;

  // CUDA specific variables
  compute::Queue<PlatformKind::CUDA> queue_;
  compute::BLASHandle<PlatformKind::CUDA> blas_handle_;

  /// reset delay count to 0
  inline void clearDelayCount()
  {
    delay_count = 0;
    prefetched_range.clear();
  }

public:
  /// default constructor
  DelayedUpdateCUDA() : delay_count(0), blas_handle_(queue_) {}

  /** resize the internal storage
   * @param norb number of electrons/orbitals
   * @param delay maximum delay, 0 < delay <= norb
   */
  inline void resize(int norb, int delay)
  {
    //tempMat.resize(norb, delay);
    V.resize(delay, norb);
    U.resize(delay, norb);
    p.resize(delay);
    Binv.resize(delay, delay);
    // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
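    // (e.g. delay = 32 keeps min(34, norb) rows of Ainv resident in the host buffer)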
    Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);

    temp_gpu.resize(norb, delay);
    delay_list.resize(delay);
    U_gpu.resize(delay, norb);
    V_gpu.resize(delay, norb);
    Binv_gpu.resize(delay, delay);
    delay_list_gpu.resize(delay);
    Ainv_gpu.resize(norb, norb);
  }

  /** compute the inverse of the transpose of matrix A and the log value of its determinant
   * @tparam TREAL real type
   */
  template<typename TREAL>
  void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
  {
    clearDelayCount();
#if defined(QMC_CUDA2HIP)
    rocsolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#else
    cusolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#endif
  }

  /** initialize internal objects when Ainv is refreshed
   * @param Ainv inverse matrix
   */
  inline void initializeInv(const Matrix<T>& Ainv)
  {
    cudaErrorCheck(cudaMemcpyAsync(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T), cudaMemcpyHostToDevice,
                                   queue_.getNative()),
                   "cudaMemcpyAsync failed!");
    clearDelayCount();
    // H2D transfer must be synchronized regardless of host memory being pinned or not.
    queue_.sync();
  }

  inline int getDelayCount() const { return delay_count; }

  /** compute the row of up-to-date Ainv
   * @param Ainv inverse matrix
   * @param rowchanged the row id corresponding to the proposed electron
   * @param invRow the output row of the up-to-date inverse
   */
  template<typename VVT>
  inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
  {
    if (!prefetched_range.checkRange(rowchanged))
    {
      int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
      cudaErrorCheck(cudaMemcpyAsync(Ainv_buffer.data(), Ainv_gpu[rowchanged],
                                     invRow.size() * (last_row - rowchanged) * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      prefetched_range.setRange(rowchanged, last_row);
      queue_.sync();
    }
    // copy the prefetched row of Ainv into invRow; the delayed corrections are applied below
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
    if (delay_count > 0)
    {
      constexpr T cone(1);
      constexpr T czero(0);
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      // multiply the correction V^T (N x K) * Binv (K x K) * U (K x N) onto invRow, evaluating right to left
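      // p = U * invRow: overlap of invRow with each accepted psiV stored in the rows of U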
      BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
      BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
      BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
    }
  }

  /** accept a move with the update delayed
   * @param Ainv inverse matrix
   * @param rowchanged the row id corresponding to the proposed electron
   * @param psiV new orbital values
   * @param ratio_new the ratio of the new and old determinants
   *
   * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm
   */
  template<typename VVT, typename RATIOT>
  inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
  {
    // update Binv from delay_count to delay_count+1
    constexpr T cone(1);
    constexpr T czero(0);
    const int norb     = Ainv.rows();
    const int lda_Binv = Binv.cols();
    // stage the prefetched row of Ainv and the accepted psiV
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], norb, V[delay_count]);
    std::copy_n(psiV.data(), norb, U[delay_count]);
    delay_list[delay_count] = rowchanged;
    // the new Binv is [[X Y] [Z sigma]]
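    // (standard bordered-matrix inverse: sigma is the reciprocal ratio, Y fills the new last
    //  column, Z the new last row, and X corrects the existing block)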
    BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
    // sigma
    const T sigma                  = static_cast<T>(RATIOT(1) / ratio_new);
    Binv[delay_count][delay_count] = sigma;
    // Y
    BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
               Binv.data() + delay_count, lda_Binv);
    // X
    BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv, Binv.data(),
              lda_Binv);
    // Z
    for (int i = 0; i < delay_count; i++)
      Binv[delay_count][i] *= sigma;
    delay_count++;
    // update Ainv when maximal delay is reached
    if (delay_count == lda_Binv)
      updateInvMat(Ainv, false);
  }

  /** update the full Ainv and reset delay_count
   * @param Ainv inverse matrix
   * @param transfer_to_host if true, copy the updated Ainv back to the host and wait for completion
   */
  inline void updateInvMat(Matrix<T>& Ainv, bool transfer_to_host = true)
  {
    // update the inverse matrix
    if (delay_count > 0)
    {
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      cudaErrorCheck(cudaMemcpyAsync(U_gpu.data(), U.data(), norb * delay_count * sizeof(T), cudaMemcpyHostToDevice,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
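      // temp_gpu = Ainv * U^T in row-major terms: project the current inverse onto the accepted psiV rows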
      compute::BLAS::gemm(blas_handle_, 'T', 'N', delay_count, norb, norb, T(1), U_gpu.data(), norb, Ainv_gpu.data(),
                          norb, T(0), temp_gpu.data(), lda_Binv);
      cudaErrorCheck(cudaMemcpyAsync(delay_list_gpu.data(), delay_list.data(), delay_count * sizeof(int),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
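      // apply the W matrix and stage the delayed rows of Ainv in V_gpu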
      applyW_stageV_cuda(delay_list_gpu.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
                         Ainv_gpu.data(), queue_.getNative());
      cudaErrorCheck(cudaMemcpyAsync(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
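      // combine the staged Ainv rows with Binv, reusing U_gpu as scratch for the product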
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, delay_count, delay_count, T(1), V_gpu.data(), norb,
                          Binv_gpu.data(), lda_Binv, T(0), U_gpu.data(), norb);
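      // rank-delay_count (Woodbury-style) downdate: Ainv_gpu -= scratch * temp_gpu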
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, norb, delay_count, T(-1), U_gpu.data(), norb, temp_gpu.data(),
                          lda_Binv, T(1), Ainv_gpu.data(), norb);
      clearDelayCount();
    }

    // transfer Ainv_gpu to Ainv and wait till completion
    if (transfer_to_host)
    {
      cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      queue_.sync();
    }
  }
};
} // namespace qmcplusplus

#endif // QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H
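Usage sketch. The fragment below is not part of the header; it is a minimal illustration of the intended call sequence, assuming the caller already owns a filled norb x norb Slater matrix. The ratio value and the acceptance decision are hypothetical placeholders for the application's own logic; everything else uses only the public interface documented above.

#include <complex>
#include <vector>
#include "DelayedUpdateCUDA.h"

using namespace qmcplusplus;

// Hypothetical driver: one sweep over all electrons with a delay rank of 16.
void sweep(const Matrix<float>& slater_matrix, int norb)
{
  DelayedUpdateCUDA<float, double> engine; // T = float, T_FP = double for the inversion
  Matrix<float> Ainv(norb, norb);
  std::complex<double> log_value;

  engine.resize(norb, 16);                                 // internal storage for delay = 16
  engine.invert_transpose(slater_matrix, Ainv, log_value); // full inversion via cu/rocSOLVER
  engine.initializeInv(Ainv);                              // seed the GPU copy of Ainv

  std::vector<float> invRow(norb), psiV(norb);
  for (int iel = 0; iel < norb; iel++)
  {
    engine.getInvRow(Ainv, iel, invRow); // row of Ainv with all pending delays folded in
    // ... fill psiV with proposed orbital values; the move ratio is dot(psiV, invRow) ...
    const float ratio       = 1.0f; // placeholder value
    const bool accept_move  = true; // placeholder decision
    if (accept_move)
      engine.acceptRow(Ainv, iel, psiV, ratio); // delayed; flushes itself at delay rank 16
  }
  engine.updateInvMat(Ainv); // flush remaining delays and copy Ainv back to the host
}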