#ifndef QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H
#define QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H

// ... includes elided in the extraction ...

#if defined(QMC_CUDA2HIP)
// ... CUDA-to-HIP runtime aliases, elided ...
#endif

namespace qmcplusplus
{
/** implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN
 *  @tparam T    base precision of the computation
 *  @tparam T_FP high precision used for matrix inversion
 */
template<typename T, typename T_FP>
class DelayedUpdateCUDA
{
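  // Editor's note (sketch, not in the original header): the "delayed update"
  // avoids refreshing the full N x N inverse after every accepted move.
  // Accepted rows are accumulated as a rank-k correction, and by the
  // Sherman-Morrison-Woodbury identity the up-to-date inverse is
  //
  //   Ainv_new = Ainv - V * Binv * (U^T * Ainv - W),
  //
  // with U holding the k new orbital rows, V the k replaced rows of the old
  // Ainv, Binv the small k x k inverse maintained in acceptRow, and W a row
  // selector handled on the GPU by applyW_stageV_cuda. Single rows are
  // corrected on the host with three gemv calls; the full update is applied
  // on the device with two gemm calls once k reaches the maximal delay.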
  // host-side staging buffers for the delayed updates
  Matrix<T, CUDAHostAllocator<T>> U;
  Matrix<T, CUDAHostAllocator<T>> Binv;
  Matrix<T> V;
  /// GPU copy of U, V, Binv, Ainv
  Matrix<T, CUDAAllocator<T>> U_gpu;
  Matrix<T, CUDAAllocator<T>> V_gpu;
  Matrix<T, CUDAAllocator<T>> Binv_gpu;
  Matrix<T, CUDAAllocator<T>> Ainv_gpu;
  Matrix<T, CUDAAllocator<T>> temp_gpu;
  /// scratch vector for the gemv partial products
  Vector<T> p;
  /// row ids of the delayed updates
  Vector<int, CUDAHostAllocator<int>> delay_list;
  Vector<int, CUDAAllocator<int>> delay_list_gpu;
  /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
  int delay_count;
  /// prefetched rows of Ainv, kept on the host
  Matrix<T, CUDAHostAllocator<T>> Ainv_buffer;
  /// helper class for the prefetched range of a vector
  PrefetchedRange prefetched_range;
#if defined(QMC_CUDA2HIP)
  rocSolverInverter<T_FP> rocsolver_inverter; // matrix inversion via rocSolver
#else
  cuSolverInverter<T_FP> cusolver_inverter; // matrix inversion via cuSolverDN
#endif
  compute::Queue<PlatformKind::CUDA> queue_;
  compute::BLASHandle<PlatformKind::CUDA> blas_handle_;

public:
  /// default constructor
  DelayedUpdateCUDA() : delay_count(0) {}

  /// resize the internal storage
  inline void resize(int norb, int delay)
  {
    // ... V, p and further host buffers are also resized here (elided in the extraction) ...
    U.resize(delay, norb);
    Binv.resize(delay, delay);
    // ...
    U_gpu.resize(delay, norb);
    V_gpu.resize(delay, norb);
    // ... remaining device allocations elided ...
  }

  /// return the current number of accumulated delays
  int getDelayCount() const { return delay_count; }

  /// reset delay count to 0
  inline void clearDelayCount() { delay_count = 0; }
  /// compute the inverse of the transpose of matrix A and its determinant value in log
  template<typename TREAL>
  void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
  {
    clearDelayCount();
#if defined(QMC_CUDA2HIP)
    rocsolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#else
    cusolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#endif
  }

  /// initialize internal objects when Ainv is refreshed
  inline void initializeInv(const Matrix<T>& Ainv)
  {
    cudaErrorCheck(cudaMemcpyAsync(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T), cudaMemcpyHostToDevice,
                                   queue_.getNative()),
                   "cudaMemcpyAsync failed!");
    clearDelayCount();
  }
  /** compute the row of up-to-date Ainv
   *  @param Ainv inverse matrix
   *  @param rowchanged the row id corresponding to the proposed electron
   */
  template<typename VVT>
  inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
  {
    if (!prefetched_range.checkRange(rowchanged))
    {
      // prefetch a contiguous block of Ainv rows from the device into the pinned host buffer
      const int last_row = std::min(rowchanged + static_cast<int>(Ainv_buffer.rows()), static_cast<int>(Ainv.rows()));
      cudaErrorCheck(cudaMemcpyAsync(Ainv_buffer.data(), Ainv_gpu[rowchanged],
                                     invRow.size() * (last_row - rowchanged) * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      prefetched_range.setRange(rowchanged, last_row);
      queue_.sync();
    }
    // start from the prefetched row of the device-side Ainv
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
    if (delay_count > 0)
    {
      constexpr T cone(1);
      constexpr T czero(0);
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      // multiply the row by V * Binv * U^T, evaluated from right to left
      BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
      BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
      BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
    }
  }
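  // Editor's note (not in the original header): written as matrices, the three
  // gemv calls above compute
  //
  //   invRow <- invRow - V * Binv * (U^T * invRow),
  //
  // i.e. the Sherman-Morrison-Woodbury correction of a single row of the
  // inverse using the k = delay_count pending updates, without touching the
  // full matrix. The vector p = U^T * invRow and the scratch row
  // Binv[delay_count] hold the two intermediate products.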
  /** accept a move with the update delayed
   *  @param Ainv inverse matrix
   *  @param rowchanged the row id corresponding to the proposed electron
   *  @param psiV new orbital values
   */
  template<typename VVT, typename RATIOT>
  inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
  {
    constexpr T cone(1);
    constexpr T czero(0);
    const int norb     = Ainv.rows();
    const int lda_Binv = Binv.cols();
    // stage the replaced row of the old Ainv and the new orbital values
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], norb, V[delay_count]);
    std::copy_n(psiV.data(), norb, U[delay_count]);
    delay_list[delay_count] = rowchanged;
    // extend Binv by one bordered row and column
    BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
    const T sigma = static_cast<T>(RATIOT(1) / ratio_new);
    // ... filling the new row/column of Binv from p and sigma, the delay_count
    //     increment, and the forced updateInvMat at maximal delay are elided
    //     in the extraction ...
  }
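  // Editor's note (not in the original header): each acceptance grows the
  // small k x k matrix Binv by one bordered row and column instead of
  // refreshing the full Ainv. sigma = 1/ratio_new becomes the new corner
  // entry of the bordered inverse; ratio_new is exactly the determinant
  // ratio of the one-row replacement, so no extra determinant evaluation
  // is needed here.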
  /// update the full Ainv and reset delay_count
  inline void updateInvMat(Matrix<T>& Ainv, bool transfer_to_host = true)
  {
    if (delay_count > 0)
    {
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      cudaErrorCheck(cudaMemcpyAsync(U_gpu.data(), U.data(), norb * delay_count * sizeof(T), cudaMemcpyHostToDevice,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      // temp = U^T * Ainv
      compute::BLAS::gemm(blas_handle_, 'T', 'N', delay_count, norb, norb, T(1), U_gpu.data(), norb, Ainv_gpu.data(),
                          norb, T(0), temp_gpu.data(), lda_Binv);
      cudaErrorCheck(cudaMemcpyAsync(delay_list_gpu.data(), delay_list.data(), delay_count * sizeof(int),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      // apply the W matrix and copy the selected rows of Ainv into V_gpu
      applyW_stageV_cuda(delay_list_gpu.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
                         Ainv_gpu.data(), queue_.getNative());
      cudaErrorCheck(cudaMemcpyAsync(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      // U_gpu = V * Binv, then Ainv -= U_gpu * temp
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, delay_count, delay_count, T(1), V_gpu.data(), norb,
                          Binv_gpu.data(), lda_Binv, T(0), U_gpu.data(), norb);
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, norb, delay_count, T(-1), U_gpu.data(), norb, temp_gpu.data(),
                          lda_Binv, T(1), Ainv_gpu.data(), norb);
      clearDelayCount();
    }
    // transfer Ainv_gpu to Ainv and wait for completion
    if (transfer_to_host)
    {
      cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      queue_.sync();
    }
  }
};
} // namespace qmcplusplus
#endif // QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H
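Usage sketch (editor's addition, not part of the header): a minimal single-walker
sweep wiring the four entry points together, assuming Matrix/Vector from
OhmmsPETE. evaluateOrbitals and acceptMove are hypothetical placeholders; only
the DelayedUpdateCUDA calls mirror the listing above.

  #include <numeric>
  #include <vector>

  const int norb = 256, kdelay = 32;            // illustrative sizes
  qmcplusplus::Matrix<double> Ainv(norb, norb); // assumed already holding the inverse

  qmcplusplus::DelayedUpdateCUDA<double, double> engine;
  engine.resize(norb, kdelay); // storage for up to kdelay delayed rows
  engine.initializeInv(Ainv);  // upload the freshly computed Ainv to the device

  std::vector<double> invRow(norb), psiV(norb);
  for (int iel = 0; iel < norb; ++iel)
  {
    engine.getInvRow(Ainv, iel, invRow); // row of the up-to-date inverse
    evaluateOrbitals(iel, psiV);         // hypothetical: orbitals at the proposed move
    // determinant ratio of the one-row replacement
    const double ratio = std::inner_product(invRow.begin(), invRow.end(), psiV.begin(), 0.0);
    if (acceptMove(ratio))               // hypothetical acceptance test
      engine.acceptRow(Ainv, iel, psiV, ratio);
  }
  engine.updateInvMat(Ainv); // flush pending delays, copy Ainv back to the host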