d3/deb/a01619_source.html

 //////////////////////////////////////////////////////////////////////////////////////
 // This file is distributed under the University of Illinois/NCSA Open Source License.
 // See LICENSE file in top directory for details.
 //
 // Copyright (c) 2022 QMCPACK developers.
 //
 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //                    Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
 //
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////

 #ifndef QMCPLUSPLUS_DELAYED_UPDATE_SYCL_H
 #define QMCPLUSPLUS_DELAYED_UPDATE_SYCL_H

 #include "OhmmsPETE/OhmmsVector.h"
 #include "OhmmsPETE/OhmmsMatrix.h"
 #include "SYCL/SYCLallocator.hpp"
 #include "SYCL/syclBLAS.hpp"
 #include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp"
 #include "DiracMatrix.h"
 #include "PrefetchedRange.h"
 #include "syclSolverInverter.hpp"
 #include "SYCL/SYCLruntime.hpp"

 //#define SYCL_BLOCKING

 namespace qmcplusplus
 {
 /** implements delayed update on Intel GPU using SYCL
  * @tparam T base precision for most computation
  * @tparam T_FP high precision for matrix inversion, T_FP >= T
  */
 template<typename T, typename T_FP>
 class DelayedUpdateSYCL
 {
   // Data staged during for delayed acceptRows
   Matrix<T> U;
   Matrix<T> Binv;
   Matrix<T> V;
   //Matrix<T> tempMat; // for debugging only
   Matrix<T, SYCLAllocator<T>> temp_gpu;
   /// GPU copy of U, V, Binv, Ainv
   Matrix<T, SYCLAllocator<T>> U_gpu;
   Matrix<T, SYCLAllocator<T>> V_gpu;
   Matrix<T, SYCLAllocator<T>> Binv_gpu;
   Matrix<T, SYCLAllocator<T>> Ainv_gpu;
   // auxiliary arrays for B
   Vector<T> p;
   // using host allocator
   Vector<int, SYCLHostAllocator<int>> delay_list;
   /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
   int delay_count;

   syclSolverInverter<T_FP> sycl_inverter_;

   // the range of prefetched_Ainv_rows
   PrefetchedRange prefetched_range;
   // Ainv prefetch buffer
   Matrix<T> Ainv_buffer;

   sycl::queue m_queue_;

   /// reset delay count to 0
   inline void clearDelayCount()
   {
     delay_count = 0;
     prefetched_range.clear();
   }

 public:
   /// default constructor
   DelayedUpdateSYCL() : delay_count(0) { m_queue_ = createSYCLInOrderQueueOnDefaultDevice(); }

   ~DelayedUpdateSYCL() { syclSolver::freeBuffer(); }

   /** resize the internal storage
    * @param norb number of electrons/orbitals
    * @param delay, maximum delay 0<delay<=norb
    */
   inline void resize(int norb, int delay)
   {
     //tempMat.resize(norb, delay);
     V.resize(delay, norb);
     U.resize(delay, norb);
     p.resize(delay);
     Binv.resize(delay, delay);
     // prefetch 8% more rows corresponding to roughly 96% acceptance ratio
     Ainv_buffer.resize(std::min(static_cast<int>(delay * 1.08), norb), norb);

     temp_gpu.resize(norb, delay);
     delay_list.resize(delay);
     U_gpu.resize(delay, norb);
     V_gpu.resize(delay, norb);
     Binv_gpu.resize(delay, delay);
     //delay_list_gpu.resize(delay);
     Ainv_gpu.resize(norb, norb);
   }

   /** compute the inverse of the transpose of matrix A and its determinant value in log
    * @tparam TREAL real type
    */
   template<typename TREAL>
   void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
   {
     clearDelayCount();

     sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
   }

   /** initialize internal objects when Ainv is refreshed
    * @param Ainv inverse matrix
    */
   inline void initializeInv(const Matrix<T>& Ainv)
   {
     // must be blocking due to potential consumption of Ainv_gpu
     m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait();
     clearDelayCount();
   }

   inline int getDelayCount() const { return delay_count; }

   /** compute the row of up-to-date Ainv
    * @param Ainv inverse matrix
    * @param rowchanged the row id corresponding to the proposed electron
    */
   template<typename VVT>
   inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
   {
     if (!prefetched_range.checkRange(rowchanged))
     {
       const int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
       m_queue_.memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T))
           .wait();
       prefetched_range.setRange(rowchanged, last_row);
     }

     // save AinvRow to new_AinvRow
     std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], invRow.size(), invRow.data());
     if (delay_count > 0)
     {
       constexpr T cone(1);
       constexpr T czero(0);
       const int norb     = Ainv.rows();
       const int lda_Binv = Binv.cols();
       // multiply V (NxK) Binv(KxK) U(KxN) AinvRow right to the left
       BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
       BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
       BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
     }
   }

   /** accept a move with the update delayed
    * @param Ainv inverse matrix
    * @param rowchanged the row id corresponding to the proposed electron
    * @param psiV new orbital values
    *
    * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm
    */
   template<typename VVT, typename RATIOT>
   inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
   {
     // update Binv from delay_count to delay_count+1
     constexpr T cone(1);
     constexpr T czero(0);
     const int norb     = Ainv.rows();
     const int lda_Binv = Binv.cols();
     std::copy_n(Ainv_buffer[prefetched_range.getOffset(rowchanged)], norb, V[delay_count]);
     std::copy_n(psiV.data(), norb, U[delay_count]);
     delay_list[delay_count] = rowchanged;
     // the new Binv is [[X Y] [Z sigma]]
     BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
     // sigma
     const T sigma                  = static_cast<T>(RATIOT(1) / ratio_new);
     Binv[delay_count][delay_count] = sigma;
     // Y
     BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
                Binv.data() + delay_count, lda_Binv);
     // X
     BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv, Binv.data(),
               lda_Binv);
     // Z
     for (int i = 0; i < delay_count; i++)
       Binv[delay_count][i] *= sigma;
     delay_count++;
     // update Ainv when maximal delay is reached
     if (delay_count == lda_Binv)
       updateInvMat(Ainv, false);
   }

   /** update the full Ainv and reset delay_count
    * @param Ainv inverse matrix
    */
   inline void updateInvMat(Matrix<T>& Ainv, bool transfer_to_host = true)
   {
     // update the inverse matrix
     if (delay_count > 0)
     {
       constexpr T cone(1);
       constexpr T czero(0);
       const int norb     = Ainv.rows();
       const int lda_Binv = Binv.cols();

       m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
       m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));

       syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb, Ainv_gpu.data(), norb,
                      czero, temp_gpu.data(), lda_Binv);

       applyW_stageV_sycl(m_queue_, delay_list.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
                          Ainv_gpu.data());

       syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb, Binv_gpu.data(),
                      lda_Binv, czero, U_gpu.data(), norb);

 #ifdef SYCL_BLOCKING
       syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
                      cone, Ainv_gpu.data(), norb)
           .wait();
 #else
       syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
                      cone, Ainv_gpu.data(), norb);
 #endif

       clearDelayCount();
     }

     // transfer Ainv_gpu to Ainv and wait till completion
     if (transfer_to_host)
       m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T)).wait();
   }
 };
 } // namespace qmcplusplus

 #endif // QMCPLUSPLUS_DELAYED_UPDATE_SYCL_H
qmcplusplus::createSYCLInOrderQueueOnDefaultDevice
sycl::queue createSYCLInOrderQueueOnDefaultDevice()
create an in-order queue using the default device
Definition: SYCLruntime.cpp:20

qmcplusplus::Vector::resize
void resize(size_type n, Type_t val=Type_t())
Resize the container.
Definition: OhmmsVector.h:166

qmcplusplus::DataLocality::queue

qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43

qmcplusplus::PrefetchedRange::clear
void clear()
Definition: PrefetchedRange.h:30

qmcplusplus::DelayedUpdateSYCL::U_gpu
Matrix< T, SYCLAllocator< T > > U_gpu
GPU copy of U, V, Binv, Ainv.
Definition: DelayedUpdateSYCL.h:44

qmcplusplus::PrefetchedRange::getOffset
int getOffset(int index) const
Definition: PrefetchedRange.h:31

qmcplusplus::DelayedUpdateSYCL::Binv_gpu
Matrix< T, SYCLAllocator< T > > Binv_gpu
Definition: DelayedUpdateSYCL.h:46

qmcplusplus::DelayedUpdateSYCL::V
Matrix< T > V
Definition: DelayedUpdateSYCL.h:40

BLAS::czero
constexpr std::complex< float > czero
Definition: BLAS.hpp:51

qmcplusplus::DelayedUpdateSYCL::DelayedUpdateSYCL
DelayedUpdateSYCL()
default constructor
Definition: DelayedUpdateSYCL.h:73

BLAS::cone
constexpr std::complex< float > cone
Definition: BLAS.hpp:50

qmcplusplus::PrefetchedRange::setRange
void setRange(int first_in, int last_in)
Definition: PrefetchedRange.h:25

sycl_determinant_helper.hpp

qmcplusplus::Vector< T >

qmcplusplus::DelayedUpdateSYCL::sycl_inverter_
syclSolverInverter< T_FP > sycl_inverter_
Definition: DelayedUpdateSYCL.h:55

qmcplusplus::Matrix::resize
void resize(size_type n, size_type m)
Resize the container.
Definition: OhmmsMatrix.h:99

qmcplusplus::DelayedUpdateSYCL::temp_gpu
Matrix< T, SYCLAllocator< T > > temp_gpu
Definition: DelayedUpdateSYCL.h:42

qmcplusplus::Vector::data
pointer data()
Definition: OhmmsVector.h:238

omptarget::min
T min(T a, T b)
Definition: OMPTargetMath.hpp:36

qmcplusplus::Matrix::cols
size_type cols() const
Definition: OhmmsMatrix.h:78

qmcplusplus::DelayedUpdateSYCL::p
Vector< T > p
Definition: DelayedUpdateSYCL.h:49

qmcplusplus::DelayedUpdateSYCL::delay_list
Vector< int, SYCLHostAllocator< int > > delay_list
Definition: DelayedUpdateSYCL.h:51

BLAS::gemv
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118

qmcplusplus::DelayedUpdateSYCL::getInvRow
void getInvRow(const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
compute the row of up-to-date Ainv
Definition: DelayedUpdateSYCL.h:128

qmcplusplus::DelayedUpdateSYCL::delay_count
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
Definition: DelayedUpdateSYCL.h:53

qmcplusplus::syclSolverInverter
implements matrix inversion via cuSolverDN
Definition: syclSolverInverter.hpp:29

qmcplusplus::DelayedUpdateSYCL::resize
void resize(int norb, int delay)
resize the internal storage
Definition: DelayedUpdateSYCL.h:81

qmcplusplus::DelayedUpdateSYCL::U
Matrix< T > U
Definition: DelayedUpdateSYCL.h:38

qmcplusplus::Matrix::size
size_type size() const
Definition: OhmmsMatrix.h:76

qmcplusplus::applyW_stageV_sycl
sycl::event applyW_stageV_sycl(sycl::queue &aq, const int *restrict delay_list_gpu, const int delay_count, T *restrict temp_gpu, const int numorbs, const int ndelay, T *restrict V_gpu, const T *restrict Ainv, const std::vector< sycl::event > &dependencies)
Definition: sycl_determinant_helper.cpp:19

PrefetchedRange.h

qmcplusplus::DelayedUpdateSYCL::~DelayedUpdateSYCL
~DelayedUpdateSYCL()
Definition: DelayedUpdateSYCL.h:75

DiracMatrix.h

qmcplusplus::DelayedUpdateSYCL::Ainv_gpu
Matrix< T, SYCLAllocator< T > > Ainv_gpu
Definition: DelayedUpdateSYCL.h:47

qmcplusplus::DelayedUpdateSYCL::acceptRow
void acceptRow(Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
accept a move with the update delayed
Definition: DelayedUpdateSYCL.h:161

qmcplusplus::syclSolver::freeBuffer
void freeBuffer()
Definition: syclSolver.hpp:24

BLAS::ger
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437

SYCLruntime.hpp

syclBLAS.hpp

qmcplusplus::syclBLAS::gemm
sycl::event gemm(sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const T alpha, const T *A, const int lda, const T *B, const int ldb, const T beta, T *C, const int ldc, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:275

qmcplusplus::Matrix::rows
size_type rows() const
Definition: OhmmsMatrix.h:77

qmcplusplus::DelayedUpdateSYCL::V_gpu
Matrix< T, SYCLAllocator< T > > V_gpu
Definition: DelayedUpdateSYCL.h:45

qmcplusplus::DelayedUpdateSYCL::invert_transpose
void invert_transpose(const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log ...
Definition: DelayedUpdateSYCL.h:104

qmcplusplus::DelayedUpdateSYCL::Ainv_buffer
Matrix< T > Ainv_buffer
Definition: DelayedUpdateSYCL.h:60

OhmmsVector.h
Declaration of Vector<T,Alloc>. Manages memory through Alloc directly and allows referencing an existing ...

qmcplusplus::syclBLAS::copy_n
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548

qmcplusplus::Matrix< T >

SYCLallocator.hpp
this file provides three C++ memory allocators using SYCL specific memory allocation functions...

qmcplusplus::DelayedUpdateSYCL::getDelayCount
int getDelayCount() const
Definition: DelayedUpdateSYCL.h:121

qmcplusplus::DelayedUpdateSYCL::Binv
Matrix< T > Binv
Definition: DelayedUpdateSYCL.h:39

qmcplusplus::PrefetchedRange::checkRange
bool checkRange(int index) const
Definition: PrefetchedRange.h:37

qmcplusplus::Matrix::data
pointer data()
Definition: OhmmsMatrix.h:182

qmcplusplus::DelayedUpdateSYCL::updateInvMat
void updateInvMat(Matrix< T > &Ainv, bool transfer_to_host=true)
update the full Ainv and reset delay_count
Definition: DelayedUpdateSYCL.h:194

qmcplusplus::DelayedUpdateSYCL::prefetched_range
PrefetchedRange prefetched_range
Definition: DelayedUpdateSYCL.h:58

qmcplusplus::PrefetchedRange
helper class for the prefetched range of a vector
Definition: PrefetchedRange.h:18

OhmmsMatrix.h

qmcplusplus::DelayedUpdateSYCL::clearDelayCount
void clearDelayCount()
reset delay count to 0
Definition: DelayedUpdateSYCL.h:65

qmcplusplus::DelayedUpdateSYCL::m_queue_
sycl::queue m_queue_
Definition: DelayedUpdateSYCL.h:62

qmcplusplus::DelayedUpdateSYCL::initializeInv
void initializeInv(const Matrix< T > &Ainv)
initialize internal objects when Ainv is refreshed
Definition: DelayedUpdateSYCL.h:114

syclSolverInverter.hpp

qmcplusplus::DelayedUpdateSYCL
implements delayed update on Intel GPU using SYCL
Definition: DelayedUpdateSYCL.h:35