QMCPACK
DelayedUpdate.h
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2019 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 #ifndef QMCPLUSPLUS_DELAYED_UPDATE_H
13 #define QMCPLUSPLUS_DELAYED_UPDATE_H
14 
15 #include "OhmmsPETE/OhmmsVector.h"
16 #include "OhmmsPETE/OhmmsMatrix.h"
17 #include "CPU/BLAS.hpp"
18 #include "CPU/BlasThreadingEnv.h"
19 #include "DiracMatrix.h"
20 #include "Concurrency/OpenMP.h"
21 
22 namespace qmcplusplus
23 {
24 /** implements delayed update on CPU using BLAS
25  * @tparam T base precision for most computation
26  * @tparam T_FP high precision for matrix inversion, T_FP >= T
27  */
28 template<typename T, typename T_FP>
30 {
31  /// orbital values of delayed electrons
33  /// rows of Ainv corresponding to delayed electrons
35  /// Matrix inverse of B, at maximum KxK
37  /// scratch space, used during inverse update
39  /// temporal scratch space used by SM-1
41  /// new column of B
43  /// list of delayed electrons
44  std::vector<int> delay_list;
45  /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
47  /// matrix inversion engine
49 
50 public:
51  /// default constructor
53 
54  /** resize the internal storage
55  * @param norb number of electrons/orbitals
56  * @param delay, maximum delay 0<delay<=norb
57  */
58  inline void resize(int norb, int delay)
59  {
60  V.resize(delay, norb);
61  U.resize(delay, norb);
62  p.resize(delay);
63  temp.resize(norb);
64  tempMat.resize(norb, delay);
65  Binv.resize(delay, delay);
66  delay_list.resize(delay);
67  }
68 
69  /** compute the inverse of the transpose of matrix A
70  * @param logdetT orbital value matrix
71  * @param Ainv inverse matrix
72  */
73  template<typename TREAL>
74  inline void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
75  {
76  detEng.invert_transpose(logdetT, Ainv, log_value);
77  // safe mechanism
78  delay_count = 0;
79  }
80 
81  /** initialize internal objects when Ainv is refreshed
82  * @param Ainv inverse matrix
83  */
84  inline void initializeInv(const Matrix<T>& Ainv)
85  {
86  // safe mechanism
87  delay_count = 0;
88  }
89 
90  inline int getDelayCount() const { return delay_count; }
91 
92  /** compute the row of up-to-date Ainv
93  * @param Ainv inverse matrix
94  * @param rowchanged the row id corresponding to the proposed electron
95  */
96  template<typename VVT>
97  inline void getInvRow(const Matrix<T>& Ainv, int rowchanged, VVT& invRow)
98  {
99  if (delay_count == 0)
100  {
101  // Ainv is fresh, directly access Ainv
102  std::copy_n(Ainv[rowchanged], invRow.size(), invRow.data());
103  return;
104  }
105  constexpr T cone(1);
106  constexpr T czero(0);
107  const int norb = Ainv.rows();
108  const int lda_Binv = Binv.cols();
109  // save Ainv[rowchanged] to invRow
110  std::copy_n(Ainv[rowchanged], norb, invRow.data());
111  // multiply V (NxK) Binv(KxK) U(KxN) invRow right to the left
112  BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, invRow.data(), 1, czero, p.data(), 1);
113  BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
114  BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
115  }
116 
117  /** accept a move with the update delayed
118  * @param Ainv inverse matrix
119  * @param rowchanged the row id corresponding to the proposed electron
120  * @param psiV new orbital values
121  *
122  * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm
123  */
124  template<typename VVT, typename RATIOT>
125  inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const RATIOT ratio_new)
126  {
127  constexpr T cone(1);
128  constexpr T czero(0);
129  const int norb = Ainv.rows();
130  const int lda_Binv = Binv.cols();
131  std::copy_n(Ainv[rowchanged], norb, V[delay_count]);
132  std::copy_n(psiV.data(), norb, U[delay_count]);
133  delay_list[delay_count] = rowchanged;
134  // the new Binv is [[X Y] [Z sigma]]
135  BLAS::gemv('T', norb, delay_count + 1, -cone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
136  // sigma
137  const T sigma = static_cast<T>(RATIOT(1) / ratio_new);
138  Binv[delay_count][delay_count] = sigma;
139  // Y
140  BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero,
141  Binv.data() + delay_count, lda_Binv);
142  // X
144  lda_Binv);
145  // Z
146  for (int i = 0; i < delay_count; i++)
147  Binv[delay_count][i] *= sigma;
148  delay_count++;
149  // update Ainv when maximal delay is reached
150  if (delay_count == lda_Binv)
151  updateInvMat(Ainv);
152  }
153 
154  /** update the full Ainv and reset delay_count
155  * @param Ainv inverse matrix
156  */
157  inline void updateInvMat(Matrix<T>& Ainv)
158  {
159  if (delay_count == 0)
160  return;
161  // update the inverse matrix
162  constexpr T cone(1);
163  constexpr T czero(0);
164  const int norb = Ainv.rows();
165  if (delay_count == 1)
166  {
167  // this is a special case invoking the Fahy's variant of Sherman-Morrison update.
168  // Only use the first norb elements of tempMat as a temporal array
169  BLAS::gemv('T', norb, norb, cone, Ainv.data(), norb, U[0], 1, czero, temp.data(), 1);
170  temp[delay_list[0]] -= cone;
171  BLAS::ger(norb, norb, -Binv[0][0], V[0], 1, temp.data(), 1, Ainv.data(), norb);
172  }
173  else
174  {
175  const int lda_Binv = Binv.cols();
176  // number of threads at the next level, forced to 1 if the problem is small.
177  const int num_threads = (norb < 256 ? 1 : getNextLevelNumThreads());
178  if (num_threads == 1 || BlasThreadingEnv::NestedThreadingSupported())
179  {
180  // threading depends on BLAS
181  BlasThreadingEnv knob(num_threads);
182  BLAS::gemm('T', 'N', delay_count, norb, norb, cone, U.data(), norb, Ainv.data(), norb, czero, tempMat.data(),
183  lda_Binv);
184  for (int i = 0; i < delay_count; i++)
185  tempMat(delay_list[i], i) -= cone;
186  BLAS::gemm('N', 'N', norb, delay_count, delay_count, cone, V.data(), norb, Binv.data(), lda_Binv, czero,
187  U.data(), norb);
188  BLAS::gemm('N', 'N', norb, norb, delay_count, -cone, U.data(), norb, tempMat.data(), lda_Binv, cone,
189  Ainv.data(), norb);
190  }
191  else
192  {
193  // manually threaded version of the above GEMM calls
194 #pragma omp parallel
195  {
196  const int block_size = getAlignedSize<T>((norb + num_threads - 1) / num_threads);
197  int num_block = (norb + block_size - 1) / block_size;
198 #pragma omp for
199  for (int ix = 0; ix < num_block; ix++)
200  {
201  int x_offset = ix * block_size;
202  BLAS::gemm('T', 'N', delay_count, std::min(norb - x_offset, block_size), norb, cone, U.data(), norb,
203  Ainv[x_offset], norb, czero, tempMat[x_offset], lda_Binv);
204  }
205 #pragma omp master
206  for (int i = 0; i < delay_count; i++)
207  tempMat(delay_list[i], i) -= cone;
208 #pragma omp for
209  for (int iy = 0; iy < num_block; iy++)
210  {
211  int y_offset = iy * block_size;
212  BLAS::gemm('N', 'N', std::min(norb - y_offset, block_size), delay_count, delay_count, cone,
213  V.data() + y_offset, norb, Binv.data(), lda_Binv, czero, U.data() + y_offset, norb);
214  }
215 #pragma omp for collapse(2) nowait
216  for (int iy = 0; iy < num_block; iy++)
217  for (int ix = 0; ix < num_block; ix++)
218  {
219  int x_offset = ix * block_size;
220  int y_offset = iy * block_size;
221  BLAS::gemm('N', 'N', std::min(norb - y_offset, block_size), std::min(norb - x_offset, block_size),
222  delay_count, -cone, U.data() + y_offset, norb, tempMat[x_offset], lda_Binv, cone,
223  Ainv[x_offset] + y_offset, norb);
224  }
225  }
226  }
227  }
228  delay_count = 0;
229  }
230 };
231 } // namespace qmcplusplus
232 
233 #endif // QMCPLUSPLUS_DELAYED_UPDATE_H
void resize(size_type n, Type_t val=Type_t())
Resize the container.
Definition: OhmmsVector.h:166
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
void acceptRow(Matrix< T > &Ainv, int rowchanged, const VVT &psiV, const RATIOT ratio_new)
accept a move with the update delayed
service class for explicitly managing the threading of BLAS/LAPACK calls from OpenMP parallel region ...
DiracMatrix< T_FP > detEng
matrix inversion engine
Definition: DelayedUpdate.h:48
constexpr std::complex< float > czero
Definition: BLAS.hpp:51
constexpr std::complex< float > cone
Definition: BLAS.hpp:50
DelayedUpdate()
default constructor
Definition: DelayedUpdate.h:52
void invert_transpose(const Matrix< T > &logdetT, Matrix< T > &Ainv, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A
Definition: DelayedUpdate.h:74
void resize(size_type n, size_type m)
Resize the container.
Definition: OhmmsMatrix.h:99
implements delayed update on CPU using BLAS
Definition: DelayedUpdate.h:29
T min(T a, T b)
size_type cols() const
Definition: OhmmsMatrix.h:78
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118
size_type size() const
Definition: OhmmsMatrix.h:76
Matrix< T > Binv
Matrix inverse of B, at maximum KxK.
Definition: DelayedUpdate.h:36
helper class to compute matrix inversion and the log value of determinant
Definition: DiracMatrix.h:111
std::vector< int > delay_list
list of delayed electrons
Definition: DelayedUpdate.h:44
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437
void initializeInv(const Matrix< T > &Ainv)
initialize internal objects when Ainv is refreshed
Definition: DelayedUpdate.h:84
void updateInvMat(Matrix< T > &Ainv)
update the full Ainv and reset delay_count
size_type rows() const
Definition: OhmmsMatrix.h:77
Vector< T > temp
temporal scratch space used by SM-1
Definition: DelayedUpdate.h:40
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
int delay_count
current number of delays, increase one for each acceptance, reset to 0 after updating Ainv ...
Definition: DelayedUpdate.h:46
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548
void resize(int norb, int delay)
resize the internal storage
Definition: DelayedUpdate.h:58
Matrix< T > tempMat
scratch space, used during inverse update
Definition: DelayedUpdate.h:38
Matrix< T > U
orbital values of delayed electrons
Definition: DelayedUpdate.h:32
static void gemm(char Atrans, char Btrans, int M, int N, int K, double alpha, const double *A, int lda, const double *restrict B, int ldb, double beta, double *restrict C, int ldc)
Definition: BLAS.hpp:235
Matrix< T > V
rows of Ainv corresponding to delayed electrons
Definition: DelayedUpdate.h:34
int getNextLevelNumThreads()
get the number of threads at the next parallel level
Definition: OpenMP.h:35
Vector< T > p
new column of B
Definition: DelayedUpdate.h:42
void getInvRow(const Matrix< T > &Ainv, int rowchanged, VVT &invRow)
compute the row of up-to-date Ainv
Definition: DelayedUpdate.h:97