d9/dfd/a01610_source.html

 //////////////////////////////////////////////////////////////////////////////////////
 // This file is distributed under the University of Illinois/NCSA Open Source License.
 // See LICENSE file in top directory for details.
 //
 // Copyright (c) 2019 QMCPACK developers.
 //
 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////

 #ifndef QMCPLUSPLUS_DELAYED_UPDATE_H
 #define QMCPLUSPLUS_DELAYED_UPDATE_H

 #include "OhmmsPETE/OhmmsVector.h"
 #include "OhmmsPETE/OhmmsMatrix.h"
 #include "CPU/BLAS.hpp"
 #include "CPU/BlasThreadingEnv.h"
 #include "DiracMatrix.h"
 #include "Concurrency/OpenMP.h"

 namespace qmcplusplus
 {
 /** implements delayed update on CPU using BLAS
  *
  * Accepted single-row changes are accumulated in U, V, Binv and delay_list
  * (see acceptRow) and folded into the full inverse matrix only when the
  * configured maximum delay is reached or when updateInvMat is called.
  * While changes are pending, getInvRow reconstructs an up-to-date row of the
  * inverse from the stale Ainv plus the accumulated data.
  *
  * NOTE(review): all Matrix objects are handed to BLAS as raw pointers; the
  * per-call comments in this class assume BLAS::gemv/ger/gemm follow the
  * reference (column-major) BLAS convention, so a row-major Matrix is seen
  * by BLAS as its transpose -- confirm against CPU/BLAS.hpp.
  *
  * @tparam T base precision for most computation
  * @tparam T_FP high precision for matrix inversion, T_FP >= T
  */
 template<typename T, typename T_FP>
 class DelayedUpdate
 {
   /// orbital values of delayed electrons, sized delay x norb by resize().
   /// Overwritten as scratch inside updateInvMat when more than one update is pending.
   Matrix<T> U;
   /// rows of Ainv corresponding to delayed electrons, sized delay x norb by resize()
   Matrix<T> V;
   /// Matrix inverse of B, at maximum KxK (delay x delay). Row delay_count doubles as scratch in getInvRow.
   Matrix<T> Binv;
   /// scratch space, used during inverse update, sized norb x delay by resize()
   Matrix<T> tempMat;
   /// temporal scratch space used by SM-1 (the single-delay branch of updateInvMat), length norb
   Vector<T> temp;
   /// new column of B, length delay
   Vector<T> p;
   /// list of delayed electrons: delay_list[k] is the row index passed to the k-th pending acceptRow
   std::vector<int> delay_list;
   /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
   int delay_count;
   /// matrix inversion engine
   DiracMatrix<T_FP> detEng;

 public:
   /// default constructor: starts with no pending delayed updates; storage is sized later by resize()
   DelayedUpdate() : delay_count(0) {}

   /** resize the internal storage
    *
    * Only the buffers are resized; delay_count is left untouched.
    *
    * @param norb number of electrons/orbitals
    * @param delay maximum delay, 0<delay<=norb
    */
   inline void resize(int norb, int delay)
   {
     V.resize(delay, norb);
     U.resize(delay, norb);
     p.resize(delay);
     temp.resize(norb);
     tempMat.resize(norb, delay);
     Binv.resize(delay, delay);
     delay_list.resize(delay);
   }

   /** compute the inverse of the transpose of matrix A
    *
    * The work is forwarded to the DiracMatrix engine; any pending delayed
    * updates are discarded by resetting delay_count.
    *
    * @tparam TREAL real type underlying log_value
    * @param logdetT orbital value matrix
    * @param Ainv inverse matrix
    * @param log_value output filled by detEng.invert_transpose
    *        (presumably the log of the determinant -- confirm in DiracMatrix.h)
    */
   template<typename TREAL>
   inline void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
   {
     detEng.invert_transpose(logdetT, Ainv, log_value);
     // safe mechanism
     delay_count = 0;
   }

   /** initialize internal objects when Ainv is refreshed
    *
    * The CPU implementation keeps no copy of Ainv, so the argument is not
    * read; the call only drops any pending delayed updates.
    *
    * @param Ainv inverse matrix (unused here)
    */
   inline void initializeInv(const Matrix<T>& Ainv)
   {
     // safe mechanism
     delay_count = 0;
   }

   /// number of accepted moves that have not yet been folded into Ainv
   inline int getDelayCount() const { return delay_count; }

   /** compute the row of up-to-date Ainv
    *
    * With no pending updates the row is copied straight from Ainv; otherwise
    * the stale row is corrected using U, Binv and V. Row delay_count of Binv
    * and the vector p are used as scratch storage by the corrected path.
    *
    * @tparam VVT vector-like type providing size() and data()
    * @param Ainv inverse matrix
    * @param rowchanged the row id corresponding to the proposed electron
    * @param invRow output, receives the requested row
    */
   template<typename VVT>
   inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
   {
     if (delay_count == 0)
     {
       // Ainv is fresh, directly access Ainv
       // note: this path copies invRow.size() elements, the delayed path below copies Ainv.rows()
       std::copy_n(Ainv[rowchanged], invRow.size(), invRow.data());
       return;
     }
     constexpr T cone(1);
     constexpr T czero(0);
     const int norb     = Ainv.rows();
     const int lda_Binv = Binv.cols();
     // save Ainv[rowchanged] to invRow
     std::copy_n(Ainv[rowchanged], norb, invRow.data());
     // multiply V (NxK) Binv(KxK) U(KxN) invRow right to the left
     // step 1: p[k] = dot(U[k], invRow) for the delay_count pending rows
     BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
     // step 2: scratch = -(Binv applied to p), stored in the not-yet-used row delay_count of Binv
     BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
     // step 3: invRow += sum_k scratch[k] * V[k]
     BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
   }

   /** accept a move with the update delayed
    *
    * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm
    *
    * @tparam VVT vector-like type providing data()
    * @tparam RATIOT type of the ratio
    * @param Ainv inverse matrix
    * @param rowchanged the row id corresponding to the proposed electron
    * @param psiV new orbital values
    * @param ratio_new ratio of the accepted move; its reciprocal becomes the new diagonal entry of Binv
    */
   template<typename VVT, typename RATIOT>
   inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
   {
     constexpr T cone(1);
     constexpr T czero(0);
     const int norb     = Ainv.rows();
     const int lda_Binv = Binv.cols();
     // record the pending update: stale Ainv row, new orbital values and the affected row index
     std::copy_n(Ainv[rowchanged], norb, V[delay_count]);
     std::copy_n(psiV.data(), norb, U[delay_count]);
     delay_list[delay_count] = rowchanged;
     // the new Binv is [[X Y] [Z sigma]]
     // p[k] = -dot(V[k], psiV) for k = 0..delay_count (includes the row just stored)
     BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
     // sigma
     const T sigma                  = static_cast<T>(RATIOT(1) / ratio_new);
     Binv[delay_count][delay_count] = sigma;
     // Y
     // written with stride lda_Binv starting at Binv.data() + delay_count, i.e. into column delay_count
     BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
                Binv.data() + delay_count, lda_Binv);
     // X
     // rank-1 update of the leading delay_count x delay_count block using row delay_count and the Y column
     BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv, Binv.data(),
               lda_Binv);
     // Z
     // the row is scaled only after X has consumed its unscaled values
     for (int i = 0; i < delay_count; i++)
       Binv[delay_count][i] *= sigma;
     delay_count++;
     // update Ainv when maximal delay is reached
     if (delay_count == lda_Binv)
       updateInvMat(Ainv);
   }

   /** update the full Ainv and reset delay_count
    *
    * Does nothing when no update is pending. A single pending update takes a
    * rank-1 path using temp; several pending updates take a three-GEMM path
    * that uses tempMat and overwrites U with an intermediate product.
    *
    * @param Ainv inverse matrix, updated in place
    */
   inline void updateInvMat(Matrix<T>& Ainv)
   {
     if (delay_count == 0)
       return;
     // update the inverse matrix
     constexpr T cone(1);
     constexpr T czero(0);
     const int norb = Ainv.rows();
     if (delay_count == 1)
     {
       // this is a special case invoking the Fahy's variant of Sherman-Morrison update.
       // The vector temp (length norb) serves as the temporal array; tempMat is not touched here.
       // temp[i] = dot(Ainv[i], U[0])
       BLAS::gemv('T', norb, norb, cone, Ainv.data(), norb, U[0], 1, czero, temp.data(), 1);
       temp[delay_list[0]] -= cone;
       // Ainv[j][i] -= Binv[0][0] * temp[j] * V[0][i]
       BLAS::ger(norb, norb, -Binv[0][0], V[0], 1, temp.data(), 1, Ainv.data(), norb);
     }
     else
     {
       const int lda_Binv = Binv.cols();
       // number of threads at the next level, forced to 1 if the problem is small.
       const int num_threads = (norb < 256 ? 1 : getNextLevelNumThreads());
       if (num_threads == 1 || BlasThreadingEnv::NestedThreadingSupported())
       {
         // threading depends on BLAS
         // knob is scoped to this branch (presumably restores the BLAS thread setting on destruction -- confirm)
         BlasThreadingEnv knob(num_threads);
         // step 1: tempMat(i, k) = dot(Ainv[i], U[k]) for the pending rows k
         BLAS::gemm('T', 'N', delay_count, norb, norb, cone, U.data(), norb, Ainv.data(), norb, czero, tempMat.data(),
                    lda_Binv);
         // subtract one at the entry matching each delayed row
         for (int i = 0; i < delay_count; i++)
           tempMat(delay_list[i], i) -= cone;
         // step 2: product of V and Binv, stored over U (the orbital values in U are no longer needed)
         BLAS::gemm('N', 'N', norb, delay_count, delay_count, cone, V.data(), norb, Binv.data(), lda_Binv, czero,
                    U.data(), norb);
         // step 3: Ainv -= (overwritten U) combined with tempMat
         BLAS::gemm('N', 'N', norb, norb, delay_count, -cone, U.data(), norb, tempMat.data(), lda_Binv, cone,
                    Ainv.data(), norb);
       }
       else
       {
         // manually threaded version of the above GEMM calls
         // NOTE(review): correctness relies on the implicit barrier at the end of each
         // `omp for` without nowait; `omp master` itself has no barrier, so the diagonal
         // correction is only guaranteed complete after the barrier of the second loop.
  #pragma omp parallel
         {
           // rows handled per block; getAlignedSize<T> presumably rounds up to an alignment-friendly count
           const int block_size = getAlignedSize<T>((norb + num_threads - 1) / num_threads);
           int num_block        = (norb + block_size - 1) / block_size;
           // step 1, blocked over rows of Ainv / tempMat; the last block may be shorter
  #pragma omp for
           for (int ix = 0; ix < num_block; ix++)
           {
             int x_offset = ix * block_size;
             BLAS::gemm('T', 'N', delay_count, std::min(norb - x_offset, block_size), norb, cone, U.data(), norb,
                        Ainv[x_offset], norb, czero, tempMat[x_offset], lda_Binv);
           }
           // same correction as in the BLAS-threaded branch, done by a single thread
  #pragma omp master
           for (int i = 0; i < delay_count; i++)
             tempMat(delay_list[i], i) -= cone;
           // step 2, blocked over the norb dimension of V / U; does not read tempMat
  #pragma omp for
           for (int iy = 0; iy < num_block; iy++)
           {
             int y_offset = iy * block_size;
             BLAS::gemm('N', 'N', std::min(norb - y_offset, block_size), delay_count, delay_count, cone,
                        V.data() + y_offset, norb, Binv.data(), lda_Binv, czero, U.data() + y_offset, norb);
           }
           // step 3, tiled in both dimensions; each (ix, iy) pair updates a distinct tile of Ainv
  #pragma omp for collapse(2) nowait
           for (int iy = 0; iy < num_block; iy++)
             for (int ix = 0; ix < num_block; ix++)
             {
               int x_offset = ix * block_size;
               int y_offset = iy * block_size;
               BLAS::gemm('N', 'N', std::min(norb - y_offset, block_size), std::min(norb - x_offset, block_size),
                          delay_count, -cone, U.data() + y_offset, norb, tempMat[x_offset], lda_Binv, cone,
                          Ainv[x_offset] + y_offset, norb);
             }
         }
       }
     }
     // all pending updates are now folded into Ainv
     delay_count = 0;
   }
 };
 } // namespace qmcplusplus

 #endif // QMCPLUSPLUS_DELAYED_UPDATE_H
qmcplusplus::Vector::resize
void resize(size_type n, Type_t val=Type_t())
Resize the container.
Definition: OhmmsVector.h:166

BlasThreadingEnv.h

qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43

OpenMP.h

BLAS.hpp

qmcplusplus::DelayedUpdate::acceptRow
void acceptRow(Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
accept a move with the update delayed
Definition: DelayedUpdate.h:125

qmcplusplus::BlasThreadingEnv
service class for explicitly managing the threading of BLAS/LAPACK calls from OpenMP parallel region ...
Definition: BlasThreadingEnv.h:22

qmcplusplus::DelayedUpdate::detEng
DiracMatrix< T_FP > detEng
matrix inversion engine
Definition: DelayedUpdate.h:48

BLAS::czero
constexpr std::complex< float > czero
Definition: BLAS.hpp:51

BLAS::cone
constexpr std::complex< float > cone
Definition: BLAS.hpp:50

qmcplusplus::DelayedUpdate::DelayedUpdate
DelayedUpdate()
default constructor
Definition: DelayedUpdate.h:52

qmcplusplus::BlasThreadingEnv::NestedThreadingSupported
static bool NestedThreadingSupported()
Definition: BlasThreadingEnv.cpp:38

qmcplusplus::Vector< T >

qmcplusplus::DelayedUpdate::invert_transpose
void invert_transpose(const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A
Definition: DelayedUpdate.h:74

qmcplusplus::Matrix::resize
void resize(size_type n, size_type m)
Resize the container.
Definition: OhmmsMatrix.h:99

qmcplusplus::DelayedUpdate
implements delayed update on CPU using BLAS
Definition: DelayedUpdate.h:29

qmcplusplus::Vector::data
pointer data()
Definition: OhmmsVector.h:238

omptarget::min
T min(T a, T b)
Definition: OMPTargetMath.hpp:36

qmcplusplus::Matrix::cols
size_type cols() const
Definition: OhmmsMatrix.h:78

BLAS::gemv
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118

qmcplusplus::Matrix::size
size_type size() const
Definition: OhmmsMatrix.h:76

qmcplusplus::DelayedUpdate::Binv
Matrix< T > Binv
Matrix inverse of B, at maximum KxK.
Definition: DelayedUpdate.h:36

DiracMatrix.h

qmcplusplus::DiracMatrix
helper class to compute matrix inversion and the log value of determinant
Definition: DiracMatrix.h:111

qmcplusplus::DelayedUpdate::delay_list
std::vector< int > delay_list
list of delayed electrons
Definition: DelayedUpdate.h:44

BLAS::ger
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437

qmcplusplus::DelayedUpdate::initializeInv
void initializeInv(const Matrix< T > &Ainv)
initialize internal objects when Ainv is refreshed
Definition: DelayedUpdate.h:84

qmcplusplus::DelayedUpdate::updateInvMat
void updateInvMat(Matrix< T > &Ainv)
update the full Ainv and reset delay_count
Definition: DelayedUpdate.h:157

qmcplusplus::Matrix::rows
size_type rows() const
Definition: OhmmsMatrix.h:77

qmcplusplus::DelayedUpdate::temp
Vector< T > temp
temporal scratch space used by SM-1
Definition: DelayedUpdate.h:40

OhmmsVector.h
Declaration of Vector<T,Alloc>. Manage memory through Alloc directly and allow referencing an existing ...

qmcplusplus::DelayedUpdate::delay_count
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
Definition: DelayedUpdate.h:46

qmcplusplus::syclBLAS::copy_n
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548

qmcplusplus::Matrix< T >

qmcplusplus::DelayedUpdate::getDelayCount
int getDelayCount() const
Definition: DelayedUpdate.h:90

qmcplusplus::DelayedUpdate::resize
void resize(int norb, int delay)
resize the internal storage
Definition: DelayedUpdate.h:58

qmcplusplus::DelayedUpdate::tempMat
Matrix< T > tempMat
scratch space, used during inverse update
Definition: DelayedUpdate.h:38

qmcplusplus::Matrix::data
pointer data()
Definition: OhmmsMatrix.h:182

qmcplusplus::DelayedUpdate::U
Matrix< T > U
orbital values of delayed electrons
Definition: DelayedUpdate.h:32

BLAS::gemm
static void gemm(char Atrans, char Btrans, int M, int N, int K, double alpha, const double *A, int lda, const double *restrict B, int ldb, double beta, double *restrict C, int ldc)
Definition: BLAS.hpp:235

qmcplusplus::DelayedUpdate::V
Matrix< T > V
rows of Ainv corresponding to delayed electrons
Definition: DelayedUpdate.h:34

getNextLevelNumThreads
int getNextLevelNumThreads()
get the number of threads at the next parallel level
Definition: OpenMP.h:35

OhmmsMatrix.h

qmcplusplus::DelayedUpdate::p
Vector< T > p
new column of B
Definition: DelayedUpdate.h:42

qmcplusplus::DelayedUpdate::getInvRow
void getInvRow(const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
compute the row of up-to-date Ainv
Definition: DelayedUpdate.h:97