QMCPACK
DelayedUpdateSYCL.h
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2022 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 // Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
9 //
10 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
11 //////////////////////////////////////////////////////////////////////////////////////
12 
13 #ifndef QMCPLUSPLUS_DELAYED_UPDATE_SYCL_H
14 #define QMCPLUSPLUS_DELAYED_UPDATE_SYCL_H
15 
16 #include "OhmmsPETE/OhmmsVector.h"
17 #include "OhmmsPETE/OhmmsMatrix.h"
18 #include "SYCL/SYCLallocator.hpp"
19 #include "SYCL/syclBLAS.hpp"
21 #include "DiracMatrix.h"
22 #include "PrefetchedRange.h"
23 #include "syclSolverInverter.hpp"
24 #include "SYCL/SYCLruntime.hpp"
25 
26 //#define SYCL_BLOCKING
27 
28 namespace qmcplusplus
29 {
30 /** implements delayed update on Intel GPU using SYCL
31  * @tparam T base precision for most computation
32  * @tparam T_FP high precision for matrix inversion, T_FP >= T
33  */
34 template<typename T, typename T_FP>
36 {
37  // Data staged during for delayed acceptRows
41  //Matrix<T> tempMat; // for debugging only
43  /// GPU copy of U, V, Binv, Ainv
48  // auxiliary arrays for B
50  // using host allocator
52  /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
54 
56 
57  // the range of prefetched_Ainv_rows
59  // Ainv prefetch buffer
61 
63 
64  /// reset delay count to 0
65  inline void clearDelayCount()
66  {
67  delay_count = 0;
69  }
70 
71 public:
72  /// default constructor
74 
76 
77  /** resize the internal storage
78  * @param norb number of electrons/orbitals
79  * @param delay, maximum delay 0<delay<=norb
80  */
81  inline void resize(int norb, int delay)
82  {
83  //tempMat.resize(norb, delay);
84  V.resize(delay, norb);
85  U.resize(delay, norb);
86  p.resize(delay);
87  Binv.resize(delay, delay);
88  // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
89  Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);
90 
91  temp_gpu.resize(norb, delay);
92  delay_list.resize(delay);
93  U_gpu.resize(delay, norb);
94  V_gpu.resize(delay, norb);
95  Binv_gpu.resize(delay, delay);
96  //delay_list_gpu.resize(delay);
97  Ainv_gpu.resize(norb, norb);
98  }
99 
100  /** compute the inverse of the transpose of matrix A and its determinant value in log
101  * @tparam TREAL real type
102  */
103  template<typename TREAL>
104  void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
105  {
106  clearDelayCount();
107 
108  sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
109  }
110 
111  /** initialize internal objects when Ainv is refreshed
112  * @param Ainv inverse matrix
113  */
114  inline void initializeInv(const Matrix<T>& Ainv)
115  {
116  // must be blocking due to potential consumption of Ainv_gpu
117  m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait();
118  clearDelayCount();
119  }
120 
121  inline int getDelayCount() const { return delay_count; }
122 
123  /** compute the row of up-to-date Ainv
124  * @param Ainv inverse matrix
125  * @param rowchanged the row id corresponding to the proposed electron
126  */
127  template<typename VVT>
128  inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
129  {
130  if (!prefetched_range.checkRange(rowchanged))
131  {
132  const int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
133  m_queue_.memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T))
134  .wait();
135  prefetched_range.setRange(rowchanged, last_row);
136  }
137 
138  // save AinvRow to new_AinvRow
139  std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
140  if (delay_count > 0)
141  {
142  constexpr T cone(1);
143  constexpr T czero(0);
144  const int norb = Ainv.rows();
145  const int lda_Binv = Binv.cols();
146  // multiply V (NxK) Binv(KxK) U(KxN) AinvRow right to the left
147  BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
148  BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
149  BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
150  }
151  }
152 
153  /** accept a move with the update delayed
154  * @param Ainv inverse matrix
155  * @param rowchanged the row id corresponding to the proposed electron
156  * @param psiV new orbital values
157  *
158  * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm
159  */
160  template<typename VVT, typename RATIOT>
161  inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
162  {
163  // update Binv from delay_count to delay_count+1
164  constexpr T cone(1);
165  constexpr T czero(0);
166  const int norb = Ainv.rows();
167  const int lda_Binv = Binv.cols();
169  std::copy_n(psiV.data(), norb, U[delay_count]);
170  delay_list[delay_count] = rowchanged;
171  // the new Binv is [[X Y] [Z sigma]]
172  BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
173  // sigma
174  const T sigma = static_cast<T>(RATIOT(1) / ratio_new);
175  Binv[delay_count][delay_count] = sigma;
176  // Y
177  BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
178  Binv.data() + delay_count, lda_Binv);
179  // X
181  lda_Binv);
182  // Z
183  for (int i = 0; i < delay_count; i++)
184  Binv[delay_count][i] *= sigma;
185  delay_count++;
186  // update Ainv when maximal delay is reached
187  if (delay_count == lda_Binv)
188  updateInvMat(Ainv, false);
189  }
190 
191  /** update the full Ainv and reset delay_count
192  * @param Ainv inverse matrix
193  */
194  inline void updateInvMat(Matrix<T>& Ainv, bool transfer_to_host = true)
195  {
196  // update the inverse matrix
197  if (delay_count > 0)
198  {
199  constexpr T cone(1);
200  constexpr T czero(0);
201  const int norb = Ainv.rows();
202  const int lda_Binv = Binv.cols();
203 
204  m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
205  m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));
206 
207  syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb, Ainv_gpu.data(), norb,
208  czero, temp_gpu.data(), lda_Binv);
209 
210  applyW_stageV_sycl(m_queue_, delay_list.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
211  Ainv_gpu.data());
212 
213  syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb, Binv_gpu.data(),
214  lda_Binv, czero, U_gpu.data(), norb);
215 
216 #ifdef SYCL_BLOCKING
217  syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
218  cone, Ainv_gpu.data(), norb)
219  .wait();
220 #else
221  syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
222  cone, Ainv_gpu.data(), norb);
223 #endif
224 
225  clearDelayCount();
226  }
227 
228  // transfer Ainv_gpu to Ainv and wait till completion
229  if (transfer_to_host)
230  m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T)).wait();
231  }
232 };
233 } // namespace qmcplusplus
234 
235 #endif // QMCPLUSPLUS_DELAYED_UPDATE_SYCL_H
sycl::queue createSYCLInOrderQueueOnDefaultDevice()
create an in-order queue using the default device
Definition: SYCLruntime.cpp:20
void resize(size_type n, Type_t val=Type_t())
Resize the container.
Definition: OhmmsVector.h:166
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
Matrix< T, SYCLAllocator< T > > U_gpu
GPU copy of U, V, Binv, Ainv.
int getOffset(int index) const
Matrix< T, SYCLAllocator< T > > Binv_gpu
constexpr std::complex< float > czero
Definition: BLAS.hpp:51
DelayedUpdateSYCL()
default constructor
constexpr std::complex< float > cone
Definition: BLAS.hpp:50
void setRange(int first_in, int last_in)
syclSolverInverter< T_FP > sycl_inverter_
void resize(size_type n, size_type m)
Resize the container.
Definition: OhmmsMatrix.h:99
Matrix< T, SYCLAllocator< T > > temp_gpu
T min(T a, T b)
size_type cols() const
Definition: OhmmsMatrix.h:78
Vector< int, SYCLHostAllocator< int > > delay_list
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118
void getInvRow(const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
compute the row of up-to-date Ainv
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
implements matrix inversion via cuSolverDN
void resize(int norb, int delay)
resize the internal storage
size_type size() const
Definition: OhmmsMatrix.h:76
sycl::event applyW_stageV_sycl(sycl::queue &aq, const int *restrict delay_list_gpu, const int delay_count, T *restrict temp_gpu, const int numorbs, const int ndelay, T *restrict V_gpu, const T *restrict Ainv, const std::vector< sycl::event > &dependencies)
Matrix< T, SYCLAllocator< T > > Ainv_gpu
void acceptRow(Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
accept a move with the update delayed
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437
sycl::event gemm(sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const T alpha, const T *A, const int lda, const T *B, const int ldb, const T beta, T *C, const int ldc, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:275
size_type rows() const
Definition: OhmmsMatrix.h:77
Matrix< T, SYCLAllocator< T > > V_gpu
void invert_transpose(const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log ...
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
this file provides three C++ memory allocators using SYCL specific memory allocation functions...
bool checkRange(int index) const
void updateInvMat(Matrix< T > &Ainv, bool transfer_to_host=true)
update the full Ainv and reset delay_count
helper class for the prefetched range of a vector
void clearDelayCount()
reset delay count to 0
void initializeInv(const Matrix< T > &Ainv)
initialize internal objects when Ainv is refreshed
implements delayed update on Intel GPU using SYCL