QMCPACK
DelayedUpdateCUDA< T, T_FP > Class Template Reference

implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN More...

Collaboration diagram for DelayedUpdateCUDA< T, T_FP > (diagram not shown)

Public Member Functions

 DelayedUpdateCUDA ()
 default constructor More...
 
void resize (int norb, int delay)
 resize the internal storage More...
 
template<typename TREAL >
void invert_transpose (const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log More...
 
void initializeInv (const Matrix< T > &Ainv)
 initialize internal objects when Ainv is refreshed More...
 
int getDelayCount () const
 
template<typename VVT >
void getInvRow (const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
 compute the row of up-to-date Ainv More...
 
template<typename VVT , typename RATIOT >
void acceptRow (Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
 accept a move with the update delayed More...
 
void updateInvMat (Matrix< T > &Ainv, bool transfer_to_host=true)
 update the full Ainv and reset delay_count More...
 

Private Member Functions

void clearDelayCount ()
 reset delay count to 0 More...
 

Private Attributes

Matrix< T, CUDAHostAllocator< T > > U
 
Matrix< T, CUDAHostAllocator< T > > Binv
 
Matrix< T > V
 
Matrix< T, CUDAAllocator< T > > temp_gpu
 
Matrix< T, CUDAAllocator< T > > U_gpu
 GPU copy of U, V, Binv, Ainv. More...
 
Matrix< T, CUDAAllocator< T > > V_gpu
 
Matrix< T, CUDAAllocator< T > > Binv_gpu
 
Matrix< T, CUDAAllocator< T > > Ainv_gpu
 
Vector< T > p
 
Vector< int, CUDAHostAllocator< int > > delay_list
 
Vector< int, CUDAAllocator< int > > delay_list_gpu
 
int delay_count
 current number of delays, increase one for each acceptance, reset to 0 after updating Ainv More...
 
cuSolverInverter< T_FP > cusolver_inverter
 
PrefetchedRange prefetched_range
 
Matrix< T, CUDAHostAllocator< T > > Ainv_buffer
 
compute::Queue< PlatformKind::CUDA > queue_
 
compute::BLASHandle< PlatformKind::CUDA > blas_handle_
 

Detailed Description

template<typename T, typename T_FP>
class qmcplusplus::DelayedUpdateCUDA< T, T_FP >

implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN

Template Parameters
T: base precision for most computation
T_FP: high precision for matrix inversion, T_FP >= T

Definition at line 35 of file DelayedUpdateCUDA.h.
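
For orientation, a minimal sketch of the intended call sequence, assembled from the member functions documented below. The inputs (logdetT, Ainv, psiV, invRow, ratio_new, accepted, norb, and the delay rank) are placeholders assumed to be supplied by the calling determinant engine; this is an illustration, not code taken from QMCPACK.

  // Sketch only: assumes qmcplusplus::Matrix/Vector instances prepared by the caller.
  using T = double;                                 // base precision
  DelayedUpdateCUDA<T, double> engine;              // T_FP = double for the inversion
  engine.resize(norb, /*delay=*/32);                // allocate host and GPU work storage

  // Fresh inversion: fills Ainv (host), its GPU copy, and the log-determinant.
  std::complex<double> log_value;
  engine.invert_transpose(logdetT, Ainv, log_value);

  // Per-electron sweep (schematic):
  for (int iel = 0; iel < norb; ++iel)
  {
    engine.getInvRow(Ainv, iel, invRow);            // row of Ainv with pending delays applied
    // ... evaluate the acceptance ratio from invRow and the proposed orbitals psiV ...
    if (accepted)
      engine.acceptRow(Ainv, iel, psiV, ratio_new); // delayed low-rank bookkeeping only
  }
  engine.updateInvMat(Ainv);                        // flush pending delays, copy Ainv back to the host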

Constructor & Destructor Documentation

◆ DelayedUpdateCUDA()

DelayedUpdateCUDA ( )
inline

default constructor

Definition at line 79 of file DelayedUpdateCUDA.h.


Member Function Documentation

◆ acceptRow()

void acceptRow ( Matrix< T > &  Ainv,
int  rowchanged,
const VVT &  psiV,
const RATIOT  ratio_new 
)
inline

accept a move with the update delayed

Parameters
Ainv: inverse matrix
rowchanged: the row id corresponding to the proposed electron
psiV: new orbital values

Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm; once the maximum is reached, updateInvMat() is invoked automatically (see the bordering identity sketched below).
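
For reference, this kind of recursion follows the standard bordering (block-inverse) identity; in generic notation (not the exact storage layout used in the code below), if

\[ M = \begin{pmatrix} A & b \\ c^{T} & d \end{pmatrix}, \qquad \sigma = \frac{1}{\,d - c^{T} A^{-1} b\,}, \]

then

\[ M^{-1} = \begin{pmatrix} A^{-1} + \sigma\, A^{-1} b\, c^{T} A^{-1} & -\sigma\, A^{-1} b \\ -\sigma\, c^{T} A^{-1} & \sigma \end{pmatrix}, \]

which is the [[X Y] [Z sigma]] block structure referred to in the code comments.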

Definition at line 173 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_buffer, DelayedUpdateCUDA< T, T_FP >::Binv, BLAS::cone, qmcplusplus::syclBLAS::copy_n(), BLAS::czero, Matrix< T, Alloc >::data(), Vector< T, Alloc >::data(), DelayedUpdateCUDA< T, T_FP >::delay_count, DelayedUpdateCUDA< T, T_FP >::delay_list, BLAS::gemv(), BLAS::ger(), PrefetchedRange::getOffset(), DelayedUpdateCUDA< T, T_FP >::p, DelayedUpdateCUDA< T, T_FP >::prefetched_range, Matrix< T, Alloc >::rows(), DelayedUpdateCUDA< T, T_FP >::U, DelayedUpdateCUDA< T, T_FP >::updateInvMat(), and DelayedUpdateCUDA< T, T_FP >::V.

  {
    // update Binv from delay_count to delay_count+1
    constexpr T cone(1);
    constexpr T czero(0);
    const int norb     = Ainv.rows();
    const int lda_Binv = Binv.cols();
    // stage the up-to-date Ainv row (from the prefetched host buffer) and the new orbital values
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], norb, V[delay_count]);
    std::copy_n(psiV.data(), norb, U[delay_count]);
    delay_list[delay_count] = rowchanged;
    // the new Binv is [[X Y] [Z sigma]]
    BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
    // sigma
    const T sigma                  = static_cast<T>(RATIOT(1) / ratio_new);
    Binv[delay_count][delay_count] = sigma;
    // Y
    BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
               Binv.data() + delay_count, lda_Binv);
    // X
    BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv, Binv.data(),
              lda_Binv);
    // Z
    for (int i = 0; i < delay_count; i++)
      Binv[delay_count][i] *= sigma;
    delay_count++;
    // update Ainv when maximal delay is reached
    if (delay_count == lda_Binv)
      updateInvMat(Ainv, false);
  }

◆ clearDelayCount()

void clearDelayCount ( )
inline private

reset delay count to 0

◆ getDelayCount()

int getDelayCount ( ) const
inline

Definition at line 131 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::delay_count.

{ return delay_count; }

◆ getInvRow()

void getInvRow ( const Matrix< T > &  Ainv,
int  rowchanged,
VVT &  invRow 
)
inline

compute the row of up-to-date Ainv

Parameters
Ainv: inverse matrix
rowchanged: the row id corresponding to the proposed electron
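
When delay_count > 0, the returned row is the prefetched row of Ainv corrected by a rank-delay_count term built from U, Binv, and V. Schematically, following the "V (NxK) Binv(KxK) U(KxN)" comment in the code below and ignoring the exact transpose conventions of the BLAS calls:

\[ \text{invRow} \leftarrow \text{invRow} \;-\; V\, B^{-1} \left( U\, \text{invRow} \right). \]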

Definition at line 138 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_buffer, DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::Binv, PrefetchedRange::checkRange(), BLAS::cone, qmcplusplus::syclBLAS::copy_n(), qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyDeviceToHost, BLAS::czero, Matrix< T, Alloc >::data(), Vector< T, Alloc >::data(), DelayedUpdateCUDA< T, T_FP >::delay_count, BLAS::gemv(), Queue< PlatformKind::CUDA >::getNative(), PrefetchedRange::getOffset(), omptarget::min(), DelayedUpdateCUDA< T, T_FP >::p, DelayedUpdateCUDA< T, T_FP >::prefetched_range, DelayedUpdateCUDA< T, T_FP >::queue_, Matrix< T, Alloc >::rows(), PrefetchedRange::setRange(), Queue< PlatformKind::CUDA >::sync(), DelayedUpdateCUDA< T, T_FP >::U, and DelayedUpdateCUDA< T, T_FP >::V.

  {
    if (!prefetched_range.checkRange(rowchanged))
    {
      // prefetch a block of up-to-date Ainv rows from the GPU into the pinned host buffer
      int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
      cudaErrorCheck(cudaMemcpyAsync(Ainv_buffer.data(), Ainv_gpu[rowchanged],
                                     invRow.size() * (last_row - rowchanged) * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      prefetched_range.setRange(rowchanged, last_row);
      queue_.sync();
    }
    // save AinvRow to new_AinvRow
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
    if (delay_count > 0)
    {
      constexpr T cone(1);
      constexpr T czero(0);
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      // multiply V (NxK) Binv(KxK) U(KxN) AinvRow right to the left
      BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
      BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
      BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
    }
  }

◆ initializeInv()

void initializeInv ( const Matrix< T > &  Ainv)
inline

initialize internal objects when Ainv is refreshed

Parameters
Ainv: inverse matrix

Definition at line 121 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::clearDelayCount(), qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyHostToDevice, Matrix< T, Alloc >::data(), Queue< PlatformKind::CUDA >::getNative(), DelayedUpdateCUDA< T, T_FP >::queue_, Matrix< T, Alloc >::size(), and Queue< PlatformKind::CUDA >::sync().

  {
    cudaErrorCheck(cudaMemcpyAsync(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T), cudaMemcpyHostToDevice,
                                   queue_.getNative()),
                   "cudaMemcpyAsync failed!");
    clearDelayCount();
    // H2D transfer must be synchronized regardless of host memory being pinned or not.
    queue_.sync();
  }

◆ invert_transpose()

void invert_transpose ( const Matrix< T > &  logdetT,
Matrix< T > &  Ainv,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log

Template Parameters
TREAL: real type
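
The determinant is returned in log form. By the usual convention for complex log-determinants (an interpretive note, not stated on this page), the real part carries the log magnitude and the imaginary part the phase:

\[ \texttt{log\_value} = \ln \lvert \det A \rvert + i \arg\!\left( \det A \right). \]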

Definition at line 108 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::clearDelayCount(), DelayedUpdateCUDA< T, T_FP >::cusolver_inverter, and rocSolverInverter< T_FP >::invert_transpose().

  {
    clearDelayCount();
#if defined(QMC_CUDA2HIP)
    rocsolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#else
    cusolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#endif
  }

◆ resize()

void resize ( int  norb,
int  delay 
)
inline

resize the internal storage

Parameters
norb: number of electrons/orbitals
delay: maximum delay, 0 < delay <= norb
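
As a small worked example of the prefetch sizing in the body below: Ainv_buffer is allocated with roughly 8% more rows than the delay rank, so for delay = 64 it holds min(int(64 * 1.08), norb) = 69 rows (assuming norb >= 69).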

Definition at line 85 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_buffer, DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::Binv, DelayedUpdateCUDA< T, T_FP >::Binv_gpu, DelayedUpdateCUDA< T, T_FP >::delay_list, DelayedUpdateCUDA< T, T_FP >::delay_list_gpu, omptarget::min(), DelayedUpdateCUDA< T, T_FP >::p, Matrix< T, Alloc >::resize(), Vector< T, Alloc >::resize(), DelayedUpdateCUDA< T, T_FP >::temp_gpu, DelayedUpdateCUDA< T, T_FP >::U, DelayedUpdateCUDA< T, T_FP >::U_gpu, DelayedUpdateCUDA< T, T_FP >::V, and DelayedUpdateCUDA< T, T_FP >::V_gpu.

  {
    //tempMat.resize(norb, delay);
    V.resize(delay, norb);
    U.resize(delay, norb);
    p.resize(delay);
    Binv.resize(delay, delay);
    // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
    Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);

    temp_gpu.resize(norb, delay);
    delay_list.resize(delay);
    U_gpu.resize(delay, norb);
    V_gpu.resize(delay, norb);
    Binv_gpu.resize(delay, delay);
    delay_list_gpu.resize(delay);
    Ainv_gpu.resize(norb, norb);
  }

◆ updateInvMat()

void updateInvMat ( Matrix< T > &  Ainv,
bool  transfer_to_host = true 
)
inline

update the full Ainv and reset delay_count

Parameters
Ainv: inverse matrix
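
For orientation, the accumulated rank-delay_count update applied here is an instance of the Sherman-Morrison-Woodbury identity; in generic notation (the class members U, V, and Binv play the roles of the low-rank factors and the small inverse):

\[ \left( A + U C V^{T} \right)^{-1} = A^{-1} - A^{-1} U \left( C^{-1} + V^{T} A^{-1} U \right)^{-1} V^{T} A^{-1}. \]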

Definition at line 206 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, applyW_stageV_cuda(), DelayedUpdateCUDA< T, T_FP >::Binv, DelayedUpdateCUDA< T, T_FP >::Binv_gpu, DelayedUpdateCUDA< T, T_FP >::blas_handle_, DelayedUpdateCUDA< T, T_FP >::clearDelayCount(), qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice, Matrix< T, Alloc >::data(), DelayedUpdateCUDA< T, T_FP >::delay_count, DelayedUpdateCUDA< T, T_FP >::delay_list, DelayedUpdateCUDA< T, T_FP >::delay_list_gpu, qmcplusplus::compute::BLAS::gemm(), Queue< PlatformKind::CUDA >::getNative(), DelayedUpdateCUDA< T, T_FP >::queue_, Matrix< T, Alloc >::rows(), Matrix< T, Alloc >::size(), Queue< PlatformKind::CUDA >::sync(), DelayedUpdateCUDA< T, T_FP >::temp_gpu, DelayedUpdateCUDA< T, T_FP >::U, DelayedUpdateCUDA< T, T_FP >::U_gpu, and DelayedUpdateCUDA< T, T_FP >::V_gpu.

Referenced by DelayedUpdateCUDA< T, T_FP >::acceptRow().

  {
    // update the inverse matrix
    if (delay_count > 0)
    {
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      cudaErrorCheck(cudaMemcpyAsync(U_gpu.data(), U.data(), norb * delay_count * sizeof(T), cudaMemcpyHostToDevice,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      compute::BLAS::gemm(blas_handle_, 'T', 'N', delay_count, norb, norb, T(1), U_gpu.data(), norb, Ainv_gpu.data(),
                          norb, T(0), temp_gpu.data(), lda_Binv);
      // upload the list of changed rows for applyW_stageV_cuda
      cudaErrorCheck(cudaMemcpyAsync(delay_list_gpu.data(), delay_list.data(), delay_count * sizeof(int),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      applyW_stageV_cuda(delay_list_gpu.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
                         Ainv_gpu.data(), queue_.getNative());
      cudaErrorCheck(cudaMemcpyAsync(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, delay_count, delay_count, T(1), V_gpu.data(), norb,
                          Binv_gpu.data(), lda_Binv, T(0), U_gpu.data(), norb);
      // rank-delay_count correction of Ainv on the GPU
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, norb, delay_count, T(-1), U_gpu.data(), norb, temp_gpu.data(),
                          lda_Binv, T(1), Ainv_gpu.data(), norb);
      clearDelayCount();
    }

    // transfer Ainv_gpu to Ainv and wait till completion
    if (transfer_to_host)
    {
      cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      queue_.sync();
    }
  }

Member Data Documentation

◆ Ainv_buffer

◆ Ainv_gpu

◆ Binv

◆ Binv_gpu

◆ blas_handle_

compute::BLASHandle<PlatformKind::CUDA> blas_handle_
private

Definition at line 68 of file DelayedUpdateCUDA.h.

Referenced by DelayedUpdateCUDA< T, T_FP >::updateInvMat().

◆ cusolver_inverter

cuSolverInverter<T_FP> cusolver_inverter
private

Definition at line 58 of file DelayedUpdateCUDA.h.

Referenced by DelayedUpdateCUDA< T, T_FP >::invert_transpose().

◆ delay_count

int delay_count
private

◆ delay_list

◆ delay_list_gpu

Vector<int, CUDAAllocator<int> > delay_list_gpu
private

◆ p

◆ prefetched_range

◆ queue_

◆ temp_gpu

◆ U

◆ U_gpu

Matrix<T, CUDAAllocator<T> > U_gpu
private

GPU copy of U, V, Binv, Ainv.

Definition at line 44 of file DelayedUpdateCUDA.h.

Referenced by DelayedUpdateCUDA< T, T_FP >::resize(), and DelayedUpdateCUDA< T, T_FP >::updateInvMat().

◆ V

◆ V_gpu


The documentation for this class was generated from the following file:
DelayedUpdateCUDA.h