QMCPACK
DelayedUpdateCUDA< T, T_FP > Class Template Reference

implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN More...

Collaboration diagram for DelayedUpdateCUDA< T, T_FP > (diagram not shown)

Public Member Functions

 DelayedUpdateCUDA ()
 default constructor More...
 
void resize (int norb, int delay)
 resize the internal storage More...
 
template<typename TREAL >
void invert_transpose (const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log More...
 
void initializeInv (const Matrix< T > &Ainv)
 initialize internal objects when Ainv is refreshed More...
 
int getDelayCount () const
 
template<typename VVT >
void getInvRow (const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
 compute the row of up-to-date Ainv More...
 
template<typename VVT , typename RATIOT >
void acceptRow (Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
 accept a move with the update delayed More...
 
void updateInvMat (Matrix< T > &Ainv, bool transfer_to_host=true)
 update the full Ainv and reset delay_count More...
 

Private Member Functions

void clearDelayCount ()
 reset delay count to 0 More...
 

Private Attributes

Matrix< T, CUDAHostAllocator< T > > U
 
Matrix< T, CUDAHostAllocator< T > > Binv
 
Matrix< T > V
 
Matrix< T, CUDAAllocator< T > > temp_gpu
 
Matrix< T, CUDAAllocator< T > > U_gpu
 GPU copy of U, V, Binv, Ainv. More...
 
Matrix< T, CUDAAllocator< T > > V_gpu
 
Matrix< T, CUDAAllocator< T > > Binv_gpu
 
Matrix< T, CUDAAllocator< T > > Ainv_gpu
 
Vector< T > p
 
Vector< int, CUDAHostAllocator< int > > delay_list
 
Vector< int, CUDAAllocator< int > > delay_list_gpu
 
int delay_count
 current number of delays, increase one for each acceptance, reset to 0 after updating Ainv More...
 
cuSolverInverter< T_FP > cusolver_inverter
 
PrefetchedRange prefetched_range
 
Matrix< T, CUDAHostAllocator< T > > Ainv_buffer
 
compute::Queue< PlatformKind::CUDA > queue_
 
compute::BLASHandle< PlatformKind::CUDA > blas_handle_
 

Detailed Description

template<typename T, typename T_FP>
class qmcplusplus::DelayedUpdateCUDA< T, T_FP >

implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN

Template Parameters
T: base precision for most computation
T_FP: high precision for matrix inversion, T_FP >= T

Definition at line 35 of file DelayedUpdateCUDA.h.
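
For orientation, a minimal sketch of the intended call sequence, assembled from the member functions documented below. The inputs (logdetT, Ainv, psiV, invRow, ratio_new, accepted, norb, and the delay rank) are placeholders assumed to be supplied by the calling determinant engine; this is an illustration, not code taken from QMCPACK.

  // Sketch only: assumes qmcplusplus::Matrix/Vector instances prepared by the caller.
  using T = double;                                 // base precision
  DelayedUpdateCUDA<T, double> engine;              // T_FP = double for the inversion
  engine.resize(norb, /*delay=*/32);                // allocate host and GPU work storage

  // Fresh inversion: fills Ainv (host), its GPU copy, and the log-determinant.
  std::complex<double> log_value;
  engine.invert_transpose(logdetT, Ainv, log_value);

  // Per-electron sweep (schematic):
  for (int iel = 0; iel < norb; ++iel)
  {
    engine.getInvRow(Ainv, iel, invRow);            // row of Ainv with pending delays applied
    // ... evaluate the acceptance ratio from invRow and the proposed orbitals psiV ...
    if (accepted)
      engine.acceptRow(Ainv, iel, psiV, ratio_new); // delayed low-rank bookkeeping only
  }
  engine.updateInvMat(Ainv);                        // flush pending delays, copy Ainv back to the host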

Constructor & Destructor Documentation

◆ DelayedUpdateCUDA()

DelayedUpdateCUDA ( )
inline

default constructor

Definition at line 79 of file DelayedUpdateCUDA.h.


Member Function Documentation

◆ acceptRow()

void acceptRow ( Matrix< T > &  Ainv,
int  rowchanged,
const VVT &  psiV,
const RATIOT  ratio_new 
)
inline

accept a move with the update delayed

Parameters
Ainv: inverse matrix
rowchanged: the row id corresponding to the proposed electron
psiV: new orbital values

Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm; once the maximum is reached, updateInvMat() is invoked automatically (see the bordering identity sketched below).
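
For reference, this kind of recursion follows the standard bordering (block-inverse) identity; in generic notation (not the exact storage layout used in the code below), if

\[ M = \begin{pmatrix} A & b \\ c^{T} & d \end{pmatrix}, \qquad \sigma = \frac{1}{\,d - c^{T} A^{-1} b\,}, \]

then

\[ M^{-1} = \begin{pmatrix} A^{-1} + \sigma\, A^{-1} b\, c^{T} A^{-1} & -\sigma\, A^{-1} b \\ -\sigma\, c^{T} A^{-1} & \sigma \end{pmatrix}, \]

which is the [[X Y] [Z sigma]] block structure referred to in the code comments.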

Definition at line 173 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_buffer, DelayedUpdateCUDA< T, T_FP >::Binv, BLAS::cone, qmcplusplus::syclBLAS::copy_n(), BLAS::czero, Matrix< T, Alloc >::data(), Vector< T, Alloc >::data(), DelayedUpdateCUDA< T, T_FP >::delay_count, DelayedUpdateCUDA< T, T_FP >::delay_list, BLAS::gemv(), BLAS::ger(), PrefetchedRange::getOffset(), DelayedUpdateCUDA< T, T_FP >::p, DelayedUpdateCUDA< T, T_FP >::prefetched_range, Matrix< T, Alloc >::rows(), DelayedUpdateCUDA< T, T_FP >::U, DelayedUpdateCUDA< T, T_FP >::updateInvMat(), and DelayedUpdateCUDA< T, T_FP >::V.

  {
    // update Binv from delay_count to delay_count+1
    constexpr T cone(1);
    constexpr T czero(0);
    const int norb     = Ainv.rows();
    const int lda_Binv = Binv.cols();
    // stage the up-to-date Ainv row (from the prefetched host buffer) and the new orbital values
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], norb, V[delay_count]);
    std::copy_n(psiV.data(), norb, U[delay_count]);
    delay_list[delay_count] = rowchanged;
    // the new Binv is [[X Y] [Z sigma]]
    BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
    // sigma
    const T sigma                  = static_cast<T>(RATIOT(1) / ratio_new);
    Binv[delay_count][delay_count] = sigma;
    // Y
    BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
               Binv.data() + delay_count, lda_Binv);
    // X
    BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv, Binv.data(),
              lda_Binv);
    // Z
    for (int i = 0; i < delay_count; i++)
      Binv[delay_count][i] *= sigma;
    delay_count++;
    // update Ainv when maximal delay is reached
    if (delay_count == lda_Binv)
      updateInvMat(Ainv, false);
  }

◆ clearDelayCount()

void clearDelayCount ( )
inline private

reset delay count to 0

◆ getDelayCount()

int getDelayCount ( ) const
inline

Definition at line 131 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::delay_count.

{ return delay_count; }

◆ getInvRow()

void getInvRow ( const Matrix< T > &  Ainv,
int  rowchanged,
VVT &  invRow 
)
inline

compute the row of up-to-date Ainv

Parameters
Ainv: inverse matrix
rowchanged: the row id corresponding to the proposed electron
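
When delay_count > 0, the returned row is the prefetched row of Ainv corrected by a rank-delay_count term built from U, Binv, and V. Schematically, following the "V (NxK) Binv(KxK) U(KxN)" comment in the code below and ignoring the exact transpose conventions of the BLAS calls:

\[ \text{invRow} \leftarrow \text{invRow} \;-\; V\, B^{-1} \left( U\, \text{invRow} \right). \]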

Definition at line 138 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_buffer, DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::Binv, PrefetchedRange::checkRange(), BLAS::cone, qmcplusplus::syclBLAS::copy_n(), qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyDeviceToHost, BLAS::czero, Matrix< T, Alloc >::data(), Vector< T, Alloc >::data(), DelayedUpdateCUDA< T, T_FP >::delay_count, BLAS::gemv(), Queue< PlatformKind::CUDA >::getNative(), PrefetchedRange::getOffset(), omptarget::min(), DelayedUpdateCUDA< T, T_FP >::p, DelayedUpdateCUDA< T, T_FP >::prefetched_range, DelayedUpdateCUDA< T, T_FP >::queue_, Matrix< T, Alloc >::rows(), PrefetchedRange::setRange(), Queue< PlatformKind::CUDA >::sync(), DelayedUpdateCUDA< T, T_FP >::U, and DelayedUpdateCUDA< T, T_FP >::V.

  {
    if (!prefetched_range.checkRange(rowchanged))
    {
      // prefetch a block of up-to-date Ainv rows from the GPU into the pinned host buffer
      int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
      cudaErrorCheck(cudaMemcpyAsync(Ainv_buffer.data(), Ainv_gpu[rowchanged],
                                     invRow.size() * (last_row - rowchanged) * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      prefetched_range.setRange(rowchanged, last_row);
      queue_.sync();
    }
    // save AinvRow to new_AinvRow
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
    if (delay_count > 0)
    {
      constexpr T cone(1);
      constexpr T czero(0);
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      // multiply V (NxK) Binv(KxK) U(KxN) AinvRow right to the left
      BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
      BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
      BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
    }
  }

◆ initializeInv()

void initializeInv ( const Matrix< T > &  Ainv)
inline

initialize internal objects when Ainv is refreshed

Parameters
Ainv: inverse matrix

Definition at line 121 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::clearDelayCount(), qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyHostToDevice, Matrix< T, Alloc >::data(), Queue< PlatformKind::CUDA >::getNative(), DelayedUpdateCUDA< T, T_FP >::queue_, Matrix< T, Alloc >::size(), and Queue< PlatformKind::CUDA >::sync().

  {
    cudaErrorCheck(cudaMemcpyAsync(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T), cudaMemcpyHostToDevice,
                                   queue_.getNative()),
                   "cudaMemcpyAsync failed!");
    clearDelayCount();
    // H2D transfer must be synchronized regardless of host memory being pinned or not.
    queue_.sync();
  }

◆ invert_transpose()

void invert_transpose ( const Matrix< T > &  logdetT,
Matrix< T > &  Ainv,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log

Template Parameters
TREAL: real type
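
The determinant is returned in log form. By the usual convention for complex log-determinants (an interpretive note, not stated on this page), the real part carries the log magnitude and the imaginary part the phase:

\[ \texttt{log\_value} = \ln \lvert \det A \rvert + i \arg\!\left( \det A \right). \]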

Definition at line 108 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::clearDelayCount(), DelayedUpdateCUDA< T, T_FP >::cusolver_inverter, and rocSolverInverter< T_FP >::invert_transpose().

  {
    clearDelayCount();
#if defined(QMC_CUDA2HIP)
    rocsolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#else
    cusolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#endif
  }

◆ resize()

void resize ( int  norb,
int  delay 
)
inline

resize the internal storage

Parameters
norb: number of electrons/orbitals
delay: maximum delay, 0 < delay <= norb
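
As a small worked example of the prefetch sizing in the body below: Ainv_buffer is allocated with roughly 8% more rows than the delay rank, so for delay = 64 it holds min(int(64 * 1.08), norb) = 69 rows (assuming norb >= 69).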

Definition at line 85 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_buffer, DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, DelayedUpdateCUDA< T, T_FP >::Binv, DelayedUpdateCUDA< T, T_FP >::Binv_gpu, DelayedUpdateCUDA< T, T_FP >::delay_list, DelayedUpdateCUDA< T, T_FP >::delay_list_gpu, omptarget::min(), DelayedUpdateCUDA< T, T_FP >::p, Matrix< T, Alloc >::resize(), Vector< T, Alloc >::resize(), DelayedUpdateCUDA< T, T_FP >::temp_gpu, DelayedUpdateCUDA< T, T_FP >::U, DelayedUpdateCUDA< T, T_FP >::U_gpu, DelayedUpdateCUDA< T, T_FP >::V, and DelayedUpdateCUDA< T, T_FP >::V_gpu.

  {
    //tempMat.resize(norb, delay);
    V.resize(delay, norb);
    U.resize(delay, norb);
    p.resize(delay);
    Binv.resize(delay, delay);
    // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
    Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);

    temp_gpu.resize(norb, delay);
    delay_list.resize(delay);
    U_gpu.resize(delay, norb);
    V_gpu.resize(delay, norb);
    Binv_gpu.resize(delay, delay);
    delay_list_gpu.resize(delay);
    Ainv_gpu.resize(norb, norb);
  }

◆ updateInvMat()

void updateInvMat ( Matrix< T > &  Ainv,
bool  transfer_to_host = true 
)
inline

update the full Ainv and reset delay_count

Parameters
Ainv: inverse matrix
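
For orientation, the accumulated rank-delay_count update applied here is an instance of the Sherman-Morrison-Woodbury identity; in generic notation (the class members U, V, and Binv play the roles of the low-rank factors and the small inverse):

\[ \left( A + U C V^{T} \right)^{-1} = A^{-1} - A^{-1} U \left( C^{-1} + V^{T} A^{-1} U \right)^{-1} V^{T} A^{-1}. \]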

Definition at line 206 of file DelayedUpdateCUDA.h.

References DelayedUpdateCUDA< T, T_FP >::Ainv_gpu, applyW_stageV_cuda(), DelayedUpdateCUDA< T, T_FP >::Binv, DelayedUpdateCUDA< T, T_FP >::Binv_gpu, DelayedUpdateCUDA< T, T_FP >::blas_handle_, DelayedUpdateCUDA< T, T_FP >::clearDelayCount(), qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice, Matrix< T, Alloc >::data(), DelayedUpdateCUDA< T, T_FP >::delay_count, DelayedUpdateCUDA< T, T_FP >::delay_list, DelayedUpdateCUDA< T, T_FP >::delay_list_gpu, qmcplusplus::compute::BLAS::gemm(), Queue< PlatformKind::CUDA >::getNative(), DelayedUpdateCUDA< T, T_FP >::queue_, Matrix< T, Alloc >::rows(), Matrix< T, Alloc >::size(), Queue< PlatformKind::CUDA >::sync(), DelayedUpdateCUDA< T, T_FP >::temp_gpu, DelayedUpdateCUDA< T, T_FP >::U, DelayedUpdateCUDA< T, T_FP >::U_gpu, and DelayedUpdateCUDA< T, T_FP >::V_gpu.

Referenced by DelayedUpdateCUDA< T, T_FP >::acceptRow().

  {
    // update the inverse matrix
    if (delay_count > 0)
    {
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      cudaErrorCheck(cudaMemcpyAsync(U_gpu.data(), U.data(), norb * delay_count * sizeof(T), cudaMemcpyHostToDevice,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      compute::BLAS::gemm(blas_handle_, 'T', 'N', delay_count, norb, norb, T(1), U_gpu.data(), norb, Ainv_gpu.data(),
                          norb, T(0), temp_gpu.data(), lda_Binv);
      // upload the list of changed rows for applyW_stageV_cuda
      cudaErrorCheck(cudaMemcpyAsync(delay_list_gpu.data(), delay_list.data(), delay_count * sizeof(int),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      applyW_stageV_cuda(delay_list_gpu.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
                         Ainv_gpu.data(), queue_.getNative());
      cudaErrorCheck(cudaMemcpyAsync(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, delay_count, delay_count, T(1), V_gpu.data(), norb,
                          Binv_gpu.data(), lda_Binv, T(0), U_gpu.data(), norb);
      // rank-delay_count correction of Ainv on the GPU
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, norb, delay_count, T(-1), U_gpu.data(), norb, temp_gpu.data(),
                          lda_Binv, T(1), Ainv_gpu.data(), norb);
      clearDelayCount();
    }

    // transfer Ainv_gpu to Ainv and wait till completion
    if (transfer_to_host)
    {
      cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      queue_.sync();
    }
  }

Member Data Documentation

◆ Ainv_buffer

◆ Ainv_gpu

◆ Binv

◆ Binv_gpu

◆ blas_handle_

compute::BLASHandle<PlatformKind::CUDA> blas_handle_
private

Definition at line 68 of file DelayedUpdateCUDA.h.

Referenced by DelayedUpdateCUDA< T, T_FP >::updateInvMat().

◆ cusolver_inverter

cuSolverInverter<T_FP> cusolver_inverter
private

Definition at line 58 of file DelayedUpdateCUDA.h.

Referenced by DelayedUpdateCUDA< T, T_FP >::invert_transpose().

◆ delay_count

int delay_count
private

◆ delay_list

◆ delay_list_gpu

Vector<int, CUDAAllocator<int> > delay_list_gpu
private

◆ p

◆ prefetched_range

◆ queue_

◆ temp_gpu

◆ U

◆ U_gpu

Matrix<T, CUDAAllocator<T> > U_gpu
private

GPU copy of U, V, Binv, Ainv.

Definition at line 44 of file DelayedUpdateCUDA.h.

Referenced by DelayedUpdateCUDA< T, T_FP >::resize(), and DelayedUpdateCUDA< T, T_FP >::updateInvMat().

◆ V

◆ V_gpu


The documentation for this class was generated from the following file:
DelayedUpdateCUDA.h