QMCPACK
DelayedUpdateCUDA.h
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2019 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H
#define QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H

#include "OhmmsPETE/OhmmsVector.h"
#include "OhmmsPETE/OhmmsMatrix.h"
#include "CUDA/CUDAruntime.hpp"
#include "CUDA/CUDAallocator.hpp"
#include "CUDA/AccelBLAS_CUDA.hpp"
#include "QMCWaveFunctions/detail/CUDA/delayed_update_helper.h"
#include "PrefetchedRange.h"
#if defined(QMC_CUDA2HIP)
#include "rocSolverInverter.hpp"
#else
#include "cuSolverInverter.hpp"
#endif

namespace qmcplusplus
{
/** implements delayed update on NVIDIA GPU using cuBLAS and cusolverDN
 * @tparam T base precision for most computation
 * @tparam T_FP high precision for matrix inversion, T_FP >= T
 */
template<typename T, typename T_FP>
class DelayedUpdateCUDA
{
  // Data staged during delayed acceptRows
  Matrix<T, CUDAHostAllocator<T>> U;
  Matrix<T, CUDAHostAllocator<T>> Binv;
  Matrix<T> V;
  //Matrix<T> tempMat; // for debugging only
  Matrix<T, CUDAAllocator<T>> temp_gpu;
  /// GPU copy of U, V, Binv, Ainv
  Matrix<T, CUDAAllocator<T>> U_gpu;
  Matrix<T, CUDAAllocator<T>> V_gpu;
  Matrix<T, CUDAAllocator<T>> Binv_gpu;
  Matrix<T, CUDAAllocator<T>> Ainv_gpu;
  // auxiliary arrays for B
  Vector<T> p;
  Vector<int, CUDAHostAllocator<int>> delay_list;
  Vector<int, CUDAAllocator<int>> delay_list_gpu;
  /// current number of delays, increased by one for each acceptance, reset to 0 after updating Ainv
  int delay_count;

#if defined(QMC_CUDA2HIP)
  rocSolverInverter<T_FP> rocsolver_inverter;
#else
  cuSolverInverter<T_FP> cusolver_inverter;
#endif

  // the range of prefetched_Ainv_rows
  PrefetchedRange prefetched_range;
  // Ainv prefetch buffer
  Matrix<T, CUDAHostAllocator<T>> Ainv_buffer;

  // CUDA specific variables
  compute::Queue<PlatformKind::CUDA> queue_;
  compute::BLASHandle<PlatformKind::CUDA> blas_handle_;

  /// reset delay count to 0
  inline void clearDelayCount()
  {
    delay_count = 0;
    prefetched_range.clear();
  }

public:
  /// default constructor
  DelayedUpdateCUDA() : delay_count(0), blas_handle_(queue_) {}

  /** resize the internal storage
   * @param norb number of electrons/orbitals
   * @param delay maximum delay, 0 < delay <= norb
   */
  inline void resize(int norb, int delay)
  {
    //tempMat.resize(norb, delay);
    V.resize(delay, norb);
    U.resize(delay, norb);
    p.resize(delay);
    Binv.resize(delay, delay);
    // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
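    // (e.g. delay = 32 keeps min(34, norb) rows of Ainv resident in the host buffer)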
    Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);

    temp_gpu.resize(norb, delay);
    delay_list.resize(delay);
    U_gpu.resize(delay, norb);
    V_gpu.resize(delay, norb);
    Binv_gpu.resize(delay, delay);
    delay_list_gpu.resize(delay);
    Ainv_gpu.resize(norb, norb);
  }

  /** compute the inverse of the transpose of matrix A and the log value of its determinant
   * @tparam TREAL real type
   */
  template<typename TREAL>
  void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
  {
    clearDelayCount();
#if defined(QMC_CUDA2HIP)
    rocsolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#else
    cusolver_inverter.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value);
#endif
  }

  /** initialize internal objects when Ainv is refreshed
   * @param Ainv inverse matrix
   */
  inline void initializeInv(const Matrix<T>& Ainv)
  {
    cudaErrorCheck(cudaMemcpyAsync(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T), cudaMemcpyHostToDevice,
                                   queue_.getNative()),
                   "cudaMemcpyAsync failed!");
    clearDelayCount();
    // H2D transfer must be synchronized regardless of host memory being pinned or not.
    queue_.sync();
  }

  inline int getDelayCount() const { return delay_count; }

  /** compute the row of up-to-date Ainv
   * @param Ainv inverse matrix
   * @param rowchanged the row id corresponding to the proposed electron
   * @param invRow the output row of the up-to-date inverse
   */
  template<typename VVT>
  inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
  {
    if (!prefetched_range.checkRange(rowchanged))
    {
      int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
      cudaErrorCheck(cudaMemcpyAsync(Ainv_buffer.data(), Ainv_gpu[rowchanged],
                                     invRow.size() * (last_row - rowchanged) * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      prefetched_range.setRange(rowchanged, last_row);
      queue_.sync();
    }
    // copy the prefetched row of Ainv into invRow; the delayed corrections are applied below
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
    if (delay_count > 0)
    {
      constexpr T cone(1);
      constexpr T czero(0);
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      // multiply the correction V^T (N x K) * Binv (K x K) * U (K x N) onto invRow, evaluating right to left
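      // p = U * invRow: overlap of invRow with each accepted psiV stored in the rows of U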
      BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
      BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
      BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
    }
  }

  /** accept a move with the update delayed
   * @param Ainv inverse matrix
   * @param rowchanged the row id corresponding to the proposed electron
   * @param psiV new orbital values
   * @param ratio_new the ratio of the new and old determinants
   *
   * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm
   */
  template<typename VVT, typename RATIOT>
  inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
  {
    // update Binv from delay_count to delay_count+1
    constexpr T cone(1);
    constexpr T czero(0);
    const int norb     = Ainv.rows();
    const int lda_Binv = Binv.cols();
    // stage the prefetched row of Ainv and the accepted psiV
    std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], norb, V[delay_count]);
    std::copy_n(psiV.data(), norb, U[delay_count]);
    delay_list[delay_count] = rowchanged;
    // the new Binv is [[X Y] [Z sigma]]
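    // (standard bordered-matrix inverse: sigma is the reciprocal ratio, Y fills the new last
    //  column, Z the new last row, and X corrects the existing block)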
    BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
    // sigma
    const T sigma                  = static_cast<T>(RATIOT(1) / ratio_new);
    Binv[delay_count][delay_count] = sigma;
    // Y
    BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
               Binv.data() + delay_count, lda_Binv);
    // X
    BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv, Binv.data(),
              lda_Binv);
    // Z
    for (int i = 0; i < delay_count; i++)
      Binv[delay_count][i] *= sigma;
    delay_count++;
    // update Ainv when maximal delay is reached
    if (delay_count == lda_Binv)
      updateInvMat(Ainv, false);
  }

  /** update the full Ainv and reset delay_count
   * @param Ainv inverse matrix
   * @param transfer_to_host if true, copy the updated Ainv back to the host and wait for completion
   */
  inline void updateInvMat(Matrix<T>& Ainv, bool transfer_to_host = true)
  {
    // update the inverse matrix
    if (delay_count > 0)
    {
      const int norb     = Ainv.rows();
      const int lda_Binv = Binv.cols();
      cudaErrorCheck(cudaMemcpyAsync(U_gpu.data(), U.data(), norb * delay_count * sizeof(T), cudaMemcpyHostToDevice,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
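      // temp_gpu = Ainv * U^T in row-major terms: project the current inverse onto the accepted psiV rows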
      compute::BLAS::gemm(blas_handle_, 'T', 'N', delay_count, norb, norb, T(1), U_gpu.data(), norb, Ainv_gpu.data(),
                          norb, T(0), temp_gpu.data(), lda_Binv);
      cudaErrorCheck(cudaMemcpyAsync(delay_list_gpu.data(), delay_list.data(), delay_count * sizeof(int),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
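      // apply the W matrix and stage the delayed rows of Ainv in V_gpu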
      applyW_stageV_cuda(delay_list_gpu.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
                         Ainv_gpu.data(), queue_.getNative());
      cudaErrorCheck(cudaMemcpyAsync(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T),
                                     cudaMemcpyHostToDevice, queue_.getNative()),
                     "cudaMemcpyAsync failed!");
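      // combine the staged Ainv rows with Binv, reusing U_gpu as scratch for the product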
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, delay_count, delay_count, T(1), V_gpu.data(), norb,
                          Binv_gpu.data(), lda_Binv, T(0), U_gpu.data(), norb);
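      // rank-delay_count (Woodbury-style) downdate: Ainv_gpu -= scratch * temp_gpu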
      compute::BLAS::gemm(blas_handle_, 'N', 'N', norb, norb, delay_count, T(-1), U_gpu.data(), norb, temp_gpu.data(),
                          lda_Binv, T(1), Ainv_gpu.data(), norb);
      clearDelayCount();
    }

    // transfer Ainv_gpu to Ainv and wait till completion
    if (transfer_to_host)
    {
      cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), cudaMemcpyDeviceToHost,
                                     queue_.getNative()),
                     "cudaMemcpyAsync failed!");
      queue_.sync();
    }
  }
};
} // namespace qmcplusplus

#endif // QMCPLUSPLUS_DELAYED_UPDATE_CUDA_H
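Usage sketch. The fragment below is not part of the header; it is a minimal illustration of the intended call sequence, assuming the caller already owns a filled norb x norb Slater matrix. The ratio value and the acceptance decision are hypothetical placeholders for the application's own logic; everything else uses only the public interface documented above.

#include <complex>
#include <vector>
#include "DelayedUpdateCUDA.h"

using namespace qmcplusplus;

// Hypothetical driver: one sweep over all electrons with a delay rank of 16.
void sweep(const Matrix<float>& slater_matrix, int norb)
{
  DelayedUpdateCUDA<float, double> engine; // T = float, T_FP = double for the inversion
  Matrix<float> Ainv(norb, norb);
  std::complex<double> log_value;

  engine.resize(norb, 16);                                 // internal storage for delay = 16
  engine.invert_transpose(slater_matrix, Ainv, log_value); // full inversion via cu/rocSOLVER
  engine.initializeInv(Ainv);                              // seed the GPU copy of Ainv

  std::vector<float> invRow(norb), psiV(norb);
  for (int iel = 0; iel < norb; iel++)
  {
    engine.getInvRow(Ainv, iel, invRow); // row of Ainv with all pending delays folded in
    // ... fill psiV with proposed orbital values; the move ratio is dot(psiV, invRow) ...
    const float ratio       = 1.0f; // placeholder value
    const bool accept_move  = true; // placeholder decision
    if (accept_move)
      engine.acceptRow(Ainv, iel, psiV, ratio); // delayed; flushes itself at delay rank 16
  }
  engine.updateInvMat(Ainv); // flush remaining delays and copy Ainv back to the host
}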