QMCPACK
DelayedUpdateSYCL< T, T_FP > Class Template Reference

implements delayed update on Intel GPU using SYCL More...

+ Collaboration diagram for DelayedUpdateSYCL< T, T_FP >:

Public Member Functions

 DelayedUpdateSYCL ()
 default constructor More...
 
 ~DelayedUpdateSYCL ()
 
void resize (int norb, int delay)
 resize the internal storage More...
 
template<typename TREAL >
void invert_transpose (const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log More...
 
void initializeInv (const Matrix< T > &Ainv)
 initialize internal objects when Ainv is refreshed More...
 
int getDelayCount () const
 
template<typename VVT >
void getInvRow (const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
 compute the row of up-to-date Ainv More...
 
template<typename VVT , typename RATIOT >
void acceptRow (Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
 accept a move with the update delayed More...
 
void updateInvMat (Matrix< T > &Ainv, bool transfer_to_host=true)
 update the full Ainv and reset delay_count More...
 

Private Member Functions

void clearDelayCount ()
 reset delay count to 0 More...
 

Private Attributes

Matrix< T > U
 
Matrix< T > Binv
 
Matrix< T > V
 
Matrix< T, SYCLAllocator< T > > temp_gpu
 
Matrix< T, SYCLAllocator< T > > U_gpu
 GPU copy of U, V, Binv, Ainv. More...
 
Matrix< T, SYCLAllocator< T > > V_gpu
 
Matrix< T, SYCLAllocator< T > > Binv_gpu
 
Matrix< T, SYCLAllocator< T > > Ainv_gpu
 
Vector< T > p
 
Vector< int, SYCLHostAllocator< int > > delay_list
 
int delay_count
 current number of delays; increased by one for each acceptance and reset to 0 after updating Ainv More...
 
syclSolverInverter< T_FP > sycl_inverter_
 
PrefetchedRange prefetched_range
 
Matrix< T > Ainv_buffer
 
sycl::queue m_queue_
 

Detailed Description

template<typename T, typename T_FP>
class qmcplusplus::DelayedUpdateSYCL< T, T_FP >

implements delayed update on Intel GPU using SYCL

Template Parameters
T: base precision for most computation
T_FP: high precision for matrix inversion, T_FP >= T

Definition at line 35 of file DelayedUpdateSYCL.h.

Constructor & Destructor Documentation

◆ DelayedUpdateSYCL()

DelayedUpdateSYCL ( )
inline

default constructor

Definition at line 73 of file DelayedUpdateSYCL.h.

References qmcplusplus::createSYCLInOrderQueueOnDefaultDevice(), and DelayedUpdateSYCL< T, T_FP >::m_queue_.

sycl::queue createSYCLInOrderQueueOnDefaultDevice()
create an in-order queue using the default device
Definition: SYCLruntime.cpp:20
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...

◆ ~DelayedUpdateSYCL()

Member Function Documentation

◆ acceptRow()

void acceptRow ( Matrix< T > &  Ainv,
int  rowchanged,
const VVT &  psiV,
const RATIOT  ratio_new 
)
inline

accept a move with the update delayed

Parameters
Ainv: inverse matrix
rowchanged: the row id corresponding to the proposed electron
psiV: new orbital values

Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm. Once delay_count reaches the maximum delay, the full Ainv is updated through updateInvMat().

Definition at line 161 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::Ainv_buffer, DelayedUpdateSYCL< T, T_FP >::Binv, Matrix< T, Alloc >::cols(), BLAS::cone, qmcplusplus::syclBLAS::copy_n(), BLAS::czero, Matrix< T, Alloc >::data(), Vector< T, Alloc >::data(), DelayedUpdateSYCL< T, T_FP >::delay_count, DelayedUpdateSYCL< T, T_FP >::delay_list, BLAS::gemv(), BLAS::ger(), PrefetchedRange::getOffset(), DelayedUpdateSYCL< T, T_FP >::p, DelayedUpdateSYCL< T, T_FP >::prefetched_range, Matrix< T, Alloc >::rows(), DelayedUpdateSYCL< T, T_FP >::U, DelayedUpdateSYCL< T, T_FP >::updateInvMat(), and DelayedUpdateSYCL< T, T_FP >::V.

162  {
163  // update Binv from delay_count to delay_count+1
164  constexpr T cone(1);
165  constexpr T czero(0);
166  const int norb = Ainv.rows();
167  const int lda_Binv = Binv.cols();
169  std::copy_n(psiV.data(), norb, U[delay_count]);
170  delay_list[delay_count] = rowchanged;
171  // the new Binv is [[X Y] [Z sigma]]
172  BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
173  // sigma
174  const T sigma = static_cast<T>(RATIOT(1) / ratio_new);
175  Binv[delay_count][delay_count] = sigma;
176  // Y
177  BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
178  Binv.data() + delay_count, lda_Binv);
179  // X
181  lda_Binv);
182  // Z
183  for (int i = 0; i < delay_count; i++)
184  Binv[delay_count][i] *= sigma;
185  delay_count++;
186  // update Ainv when maximal delay is reached
187  if (delay_count == lda_Binv)
188  updateInvMat(Ainv, false);
189  }
int getOffset(int index) const
constexpr std::complex< float > czero
Definition: BLAS.hpp:51
constexpr std::complex< float > cone
Definition: BLAS.hpp:50
size_type cols() const
Definition: OhmmsMatrix.h:78
Vector< int, SYCLHostAllocator< int > > delay_list
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437
size_type rows() const
Definition: OhmmsMatrix.h:77
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
void updateInvMat(Matrix< T > &Ainv, bool transfer_to_host=true)
update the full Ainv and reset delay_count

◆ clearDelayCount()

void clearDelayCount ( )
inline private

◆ getDelayCount()

int getDelayCount ( ) const
inline

Definition at line 121 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::delay_count.

121 { return delay_count; }
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...

◆ getInvRow()

void getInvRow ( const Matrix< T > &  Ainv,
int  rowchanged,
VVT &  invRow 
)
inline

compute the row of up-to-date Ainv

Parameters
Ainv: inverse matrix
rowchanged: the row id corresponding to the proposed electron

Definition at line 128 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::Ainv_buffer, DelayedUpdateSYCL< T, T_FP >::Ainv_gpu, DelayedUpdateSYCL< T, T_FP >::Binv, PrefetchedRange::checkRange(), Matrix< T, Alloc >::cols(), BLAS::cone, qmcplusplus::syclBLAS::copy_n(), BLAS::czero, Matrix< T, Alloc >::data(), Vector< T, Alloc >::data(), DelayedUpdateSYCL< T, T_FP >::delay_count, BLAS::gemv(), PrefetchedRange::getOffset(), DelayedUpdateSYCL< T, T_FP >::m_queue_, omptarget::min(), DelayedUpdateSYCL< T, T_FP >::p, DelayedUpdateSYCL< T, T_FP >::prefetched_range, Matrix< T, Alloc >::rows(), PrefetchedRange::setRange(), Matrix< T, Alloc >::size(), DelayedUpdateSYCL< T, T_FP >::U, and DelayedUpdateSYCL< T, T_FP >::V.

129  {
130  if (!prefetched_range.checkRange(rowchanged))
131  {
132  const int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
133  m_queue_.memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T))
134  .wait();
135  prefetched_range.setRange(rowchanged, last_row);
136  }
137 
138  // save AinvRow to new_AinvRow
139  std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
140  if (delay_count > 0)
141  {
142  constexpr T cone(1);
143  constexpr T czero(0);
144  const int norb = Ainv.rows();
145  const int lda_Binv = Binv.cols();
146  // multiply V (NxK) Binv(KxK) U(KxN) AinvRow right to the left
147  BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
148  BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
149  BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
150  }
151  }
int getOffset(int index) const
constexpr std::complex< float > czero
Definition: BLAS.hpp:51
constexpr std::complex< float > cone
Definition: BLAS.hpp:50
void setRange(int first_in, int last_in)
T min(T a, T b)
size_type cols() const
Definition: OhmmsMatrix.h:78
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
size_type size() const
Definition: OhmmsMatrix.h:76
Matrix< T, SYCLAllocator< T > > Ainv_gpu
size_type rows() const
Definition: OhmmsMatrix.h:77
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
bool checkRange(int index) const

◆ initializeInv()

void initializeInv ( const Matrix< T > &  Ainv)
inline

initialize internal objects when Ainv is refreshed

Parameters
Ainv: inverse matrix

Definition at line 114 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::Ainv_gpu, DelayedUpdateSYCL< T, T_FP >::clearDelayCount(), Matrix< T, Alloc >::data(), DelayedUpdateSYCL< T, T_FP >::m_queue_, and Matrix< T, Alloc >::size().

115  {
116  // must be blocking due to potential consumption of Ainv_gpu
117  m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait();
118  clearDelayCount();
119  }
size_type size() const
Definition: OhmmsMatrix.h:76
Matrix< T, SYCLAllocator< T > > Ainv_gpu
void clearDelayCount()
reset delay count to 0

◆ invert_transpose()

void invert_transpose ( const Matrix< T > &  logdetT,
Matrix< T > &  Ainv,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log

Template Parameters
TREAL: real type

Definition at line 104 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::Ainv_gpu, DelayedUpdateSYCL< T, T_FP >::clearDelayCount(), DelayedUpdateSYCL< T, T_FP >::m_queue_, and DelayedUpdateSYCL< T, T_FP >::sycl_inverter_.

105  {
106  clearDelayCount();
107 
108  sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
109  }
syclSolverInverter< T_FP > sycl_inverter_
Matrix< T, SYCLAllocator< T > > Ainv_gpu
void clearDelayCount()
reset delay count to 0

◆ resize()

void resize ( int  norb,
int  delay 
)
inline

resize the internal storage

Parameters
norb: number of electrons/orbitals
delay: maximum delay, 0 < delay <= norb

Definition at line 81 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::Ainv_buffer, DelayedUpdateSYCL< T, T_FP >::Ainv_gpu, DelayedUpdateSYCL< T, T_FP >::Binv, DelayedUpdateSYCL< T, T_FP >::Binv_gpu, DelayedUpdateSYCL< T, T_FP >::delay_list, omptarget::min(), DelayedUpdateSYCL< T, T_FP >::p, Matrix< T, Alloc >::resize(), Vector< T, Alloc >::resize(), DelayedUpdateSYCL< T, T_FP >::temp_gpu, DelayedUpdateSYCL< T, T_FP >::U, DelayedUpdateSYCL< T, T_FP >::U_gpu, DelayedUpdateSYCL< T, T_FP >::V, and DelayedUpdateSYCL< T, T_FP >::V_gpu.

82  {
83  //tempMat.resize(norb, delay);
84  V.resize(delay, norb);
85  U.resize(delay, norb);
86  p.resize(delay);
87  Binv.resize(delay, delay);
88  // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
89  Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);
90 
91  temp_gpu.resize(norb, delay);
92  delay_list.resize(delay);
93  U_gpu.resize(delay, norb);
94  V_gpu.resize(delay, norb);
95  Binv_gpu.resize(delay, delay);
96  //delay_list_gpu.resize(delay);
97  Ainv_gpu.resize(norb, norb);
98  }
void resize(size_type n, Type_t val=Type_t())
Resize the container.
Definition: OhmmsVector.h:166
Matrix< T, SYCLAllocator< T > > U_gpu
GPU copy of U, V, Binv, Ainv.
Matrix< T, SYCLAllocator< T > > Binv_gpu
void resize(size_type n, size_type m)
Resize the container.
Definition: OhmmsMatrix.h:99
Matrix< T, SYCLAllocator< T > > temp_gpu
T min(T a, T b)
Vector< int, SYCLHostAllocator< int > > delay_list
Matrix< T, SYCLAllocator< T > > Ainv_gpu
Matrix< T, SYCLAllocator< T > > V_gpu

◆ updateInvMat()

void updateInvMat ( Matrix< T > &  Ainv,
bool  transfer_to_host = true 
)
inline

update the full Ainv and reset delay_count

Parameters
Ainv: inverse matrix

Definition at line 194 of file DelayedUpdateSYCL.h.

References DelayedUpdateSYCL< T, T_FP >::Ainv_gpu, qmcplusplus::applyW_stageV_sycl(), DelayedUpdateSYCL< T, T_FP >::Binv, DelayedUpdateSYCL< T, T_FP >::Binv_gpu, DelayedUpdateSYCL< T, T_FP >::clearDelayCount(), Matrix< T, Alloc >::cols(), BLAS::cone, BLAS::czero, Matrix< T, Alloc >::data(), DelayedUpdateSYCL< T, T_FP >::delay_count, DelayedUpdateSYCL< T, T_FP >::delay_list, qmcplusplus::syclBLAS::gemm(), DelayedUpdateSYCL< T, T_FP >::m_queue_, Matrix< T, Alloc >::rows(), Matrix< T, Alloc >::size(), DelayedUpdateSYCL< T, T_FP >::temp_gpu, DelayedUpdateSYCL< T, T_FP >::U, DelayedUpdateSYCL< T, T_FP >::U_gpu, and DelayedUpdateSYCL< T, T_FP >::V_gpu.

Referenced by DelayedUpdateSYCL< T, T_FP >::acceptRow().

195  {
196  // update the inverse matrix
197  if (delay_count > 0)
198  {
199  constexpr T cone(1);
200  constexpr T czero(0);
201  const int norb = Ainv.rows();
202  const int lda_Binv = Binv.cols();
203 
204  m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
205  m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));
206 
207  syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb, Ainv_gpu.data(), norb,
208  czero, temp_gpu.data(), lda_Binv);
209 
210  applyW_stageV_sycl(m_queue_, delay_list.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
211  Ainv_gpu.data());
212 
213  syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb, Binv_gpu.data(),
214  lda_Binv, czero, U_gpu.data(), norb);
215 
216 #ifdef SYCL_BLOCKING
217  syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
218  cone, Ainv_gpu.data(), norb)
219  .wait();
220 #else
221  syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
222  cone, Ainv_gpu.data(), norb);
223 #endif
224 
225  clearDelayCount();
226  }
227 
228  // transfer Ainv_gpu to Ainv and wait till completion
229  if (transfer_to_host)
230  m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T)).wait();
231  }
Matrix< T, SYCLAllocator< T > > U_gpu
GPU copy of U, V, Binv, Ainv.
Matrix< T, SYCLAllocator< T > > Binv_gpu
constexpr std::complex< float > czero
Definition: BLAS.hpp:51
constexpr std::complex< float > cone
Definition: BLAS.hpp:50
Matrix< T, SYCLAllocator< T > > temp_gpu
size_type cols() const
Definition: OhmmsMatrix.h:78
Vector< int, SYCLHostAllocator< int > > delay_list
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
size_type size() const
Definition: OhmmsMatrix.h:76
sycl::event applyW_stageV_sycl(sycl::queue &aq, const int *restrict delay_list_gpu, const int delay_count, T *restrict temp_gpu, const int numorbs, const int ndelay, T *restrict V_gpu, const T *restrict Ainv, const std::vector< sycl::event > &dependencies)
Matrix< T, SYCLAllocator< T > > Ainv_gpu
sycl::event gemm(sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const T alpha, const T *A, const int lda, const T *B, const int ldb, const T beta, T *C, const int ldc, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:275
size_type rows() const
Definition: OhmmsMatrix.h:77
Matrix< T, SYCLAllocator< T > > V_gpu
void clearDelayCount()
reset delay count to 0

Member Data Documentation

◆ Ainv_buffer

◆ Ainv_gpu

◆ Binv

◆ Binv_gpu

◆ delay_count

int delay_count
private

◆ delay_list

◆ m_queue_

◆ p

◆ prefetched_range

◆ sycl_inverter_

syclSolverInverter<T_FP> sycl_inverter_
private

Definition at line 55 of file DelayedUpdateSYCL.h.

Referenced by DelayedUpdateSYCL< T, T_FP >::invert_transpose().

◆ temp_gpu

◆ U

◆ U_gpu

Matrix<T, SYCLAllocator<T> > U_gpu
private

GPU copy of U, V, Binv, Ainv.

Definition at line 44 of file DelayedUpdateSYCL.h.

Referenced by DelayedUpdateSYCL< T, T_FP >::resize(), and DelayedUpdateSYCL< T, T_FP >::updateInvMat().

◆ V

◆ V_gpu


The documentation for this class was generated from the following file: