DelayedUpdateBatched.h
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2024 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 // Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
9 //
10 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
11 //////////////////////////////////////////////////////////////////////////////////////
12 
13 #ifndef QMCPLUSPLUS_DELAYED_UPDATE_BATCHED_H
14 #define QMCPLUSPLUS_DELAYED_UPDATE_BATCHED_H
15 
16 #include "OhmmsPETE/OhmmsVector.h"
17 #include "OhmmsPETE/OhmmsMatrix.h"
21 #include "WaveFunctionTypes.hpp"
22 #include "QueueAliases.hpp"
23 #include "AccelBLAS.hpp"
24 
25 namespace qmcplusplus
26 {
27 
28 /** implements Dirac matrix delayed update using OpenMP offload and CUDA.
29  * It is used as DET_ENGINE in DiracDeterminantBatched.
30  * There is one instance of this class per walker.
31  *
32  * @tparam PL PlatformKind selecting the accelerator queue and batched BLAS implementation
33  * @tparam VALUE base precision type used for most of the computation
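 *
 * A minimal usage sketch (hypothetical driver-side code; the crowd-level objects such as
 * engines, mw_rsc and psiMinv_refs are placeholders, not defined in this header):
 * @code
 * using Engine = DelayedUpdateBatched<PlatformKind::CUDA, double>;
 * Engine engine(norb, delay); // one engine per walker
 * // crowd-level operations go through the static mw_ calls, e.g.
 * // Engine::mw_evalGrad(engines, mw_rsc, psiMinv_refs, dpsiM_row_list, iel, grad_now);
 * // Engine::mw_accept_rejectRow(engines, mw_rsc, psiMinv_refs, iel, psiM_g_list,
 * //                             psiM_l_list, isAccepted, phi_vgl_v, ratios);
 * // Engine::mw_updateInvMat(engines, mw_rsc, psiMinv_refs); // flush pending delayed rows
 * @endcode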
34  */
35 template<PlatformKind PL, typename VALUE>
36 class DelayedUpdateBatched
37 {
38 public:
39  using This_t = DelayedUpdateBatched<PL, VALUE>;
40  using Value = VALUE;
41  using Real = RealAlias<Value>;
42  using Complex = std::complex<Real>;
43 
44  // containers
45  template<typename DT>
47  template<typename DT>
49  template<typename DT>
51  template<typename DT>
53  template<typename DT>
55  template<typename DT>
57 
58  struct MultiWalkerResource
59  {
60  // CUDA stream, cublas handle object
61  compute::Queue<PL> queue;
62  compute::BLASHandle<PL> blas_handle;
63 
64  // constant array value VALUE(1)
65  UnpinnedDualVector<Value> cone_vec;
66  // constant array value VALUE(-1)
67  UnpinnedDualVector<Value> cminusone_vec;
68  // constant array value VALUE(0)
69  UnpinnedDualVector<Value> czero_vec;
70  // multi walker of grads for transfer needs.
71  DualMatrix<Value> grads_value_v;
72  // multi walker of spingrads for transfer needs.
73  UnpinnedDualVector<Complex> spingrads_value_v;
74  // mw_updateRow pointer buffer
75  Vector<char, OffloadPinnedAllocator<char>> updateRow_buffer_H2D;
76  // mw_prepareInvRow pointer buffer
77  Vector<char, OffloadPinnedAllocator<char>> prepare_inv_row_buffer_H2D;
78  // mw_accept_rejectRow pointer buffer
79  Vector<char, OffloadPinnedAllocator<char>> accept_rejectRow_buffer_H2D;
80  // mw_updateInv pointer buffer
81  Vector<char, OffloadPinnedAllocator<char>> updateInv_buffer_H2D;
82  // mw_evalGrad pointer buffer
83  Vector<char, OffloadPinnedAllocator<char>> evalGrad_buffer_H2D;
84  /// scratch space for rank-1 update
85  UnpinnedDualVector<Value> mw_temp;
86  // scratch space for keeping one row of Ainv
87  UnpinnedDualVector<Value> mw_rcopy;
88 
90 
91  inline void resize_fill_constant_arrays(size_t nw)
92  {
93  if (cone_vec.size() < nw)
94  {
95  // cone
96  cone_vec.resize(nw);
97  std::fill_n(cone_vec.data(), nw, Value(1));
98  cone_vec.updateTo();
99  // cminusone
100  cminusone_vec.resize(nw);
101  std::fill_n(cminusone_vec.data(), nw, Value(-1));
102  cminusone_vec.updateTo();
103  // czero
104  czero_vec.resize(nw);
105  std::fill_n(czero_vec.data(), nw, Value(0));
106  czero_vec.updateTo();
107  }
108  }
109  };
110 
111 private:
112  /// scratch space for rank-1 update
113  UnpinnedDualVector<Value> temp;
114  /// row of up-to-date Ainv
115  UnpinnedDualVector<Value> invRow;
116  /** row id corresponding to the up-to-date invRow. [0, norb): invRow is ready; -1: invRow is not valid.
117  * This id is set after calling getInvRow indicating invRow has been prepared for the invRow_id row
118  * ratioGrad checks if invRow_id is consistent. If not, invRow needs to be recomputed.
119  * acceptMove and completeUpdates mark invRow invalid by setting invRow_id to -1
120  */
121  int invRow_id;
122  // scratch space for keeping one row of Ainv
123  UnpinnedDualVector<Value> rcopy;
124 
125  template<typename DT>
127  template<typename DT>
129  /// orbital values of delayed electrons
130  DeviceMatrix<Value> U_gpu;
131  /// rows of Ainv corresponding to delayed electrons
132  DeviceMatrix<Value> V_gpu;
133  /// Matrix inverse of B, at maximum KxK
134  DeviceMatrix<Value> Binv_gpu;
135  /// scratch space, used during inverse update
136  DeviceMatrix<Value> tempMat_gpu;
137  /// new column of B
138  DeviceVector<Value> p_gpu;
139  /// list of delayed electrons
140  DeviceVector<int> delay_list_gpu;
141  /// current number of delays, increases by one for each acceptance, reset to 0 after updating Ainv
142  int delay_count;
143  /// if true, updates are not delayed.
144  const bool no_delayed_update_;
145 
146  /** resize the internal storage
147  * @param norb number of electrons/orbitals
148  * @param delay maximum delay, 0 < delay <= norb
149  */
150  inline void resize(int norb, int delay)
151  {
152  V_gpu.resize(delay, norb);
153  U_gpu.resize(delay, norb);
154  p_gpu.resize(delay);
155  tempMat_gpu.resize(norb, delay);
156  Binv_gpu.resize(delay, delay);
157  delay_list_gpu.resize(delay);
158  invRow.resize(norb);
159  }
160 
161  /** ensure no previous delay is left.
162  * This looks like it should be an assert
163  */
164  inline void guard_no_delay() const
165  {
166  if (delay_count != 0)
167  throw std::runtime_error("BUG: unexpected call sequence delay_count is not 0");
168  }
169 
170  /** compute the row of up-to-date Ainv
171  * @param psiMinv_refs inverse matrices (psiMinv) of all walkers in the crowd
172  * @param rowchanged the row id corresponding to the proposed electron
173  */
174  static void mw_prepareInvRow(const RefVectorWithLeader<This_t>& engines,
175  MultiWalkerResource& mw_rsc,
176  const RefVector<DualMatrix<Value>>& psiMinv_refs,
177  const int rowchanged)
178  {
179  auto& engine_leader = engines.getLeader();
180  auto& blas_handle = mw_rsc.blas_handle;
181  auto& queue = mw_rsc.queue;
182  auto& cminusone_vec = mw_rsc.cminusone_vec;
183  auto& cone_vec = mw_rsc.cone_vec;
184  auto& czero_vec = mw_rsc.czero_vec;
185  auto& prepare_inv_row_buffer_H2D = mw_rsc.prepare_inv_row_buffer_H2D;
186  const int norb = engine_leader.invRow.size();
187  const int nw = engines.size();
188  int& delay_count = engine_leader.delay_count;
189 
190  constexpr size_t num_ptrs_packed = 7; // it must match packing and unpacking
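 // Packed device-pointer layout in prepare_inv_row_buffer_H2D, nw pointers per slot:
 // [0] Ainv row to refresh  [1] invRow  [2] U_gpu  [3] p_gpu
 // [4] Binv_gpu  [5] Binv_gpu row delay_count (scratch)  [6] V_gpu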
191  prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
192  mw_rsc.resize_fill_constant_arrays(nw);
193 
194  const int lda_Binv = engine_leader.Binv_gpu.cols();
195  Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.data()), num_ptrs_packed, nw);
196  for (int iw = 0; iw < nw; iw++)
197  {
198  This_t& engine = engines[iw];
199  DualMatrix<Value>& psiMinv = psiMinv_refs[iw];
200  ptr_buffer[0][iw] = psiMinv.device_data() + rowchanged * psiMinv.cols();
201  ptr_buffer[1][iw] = engine.invRow.device_data();
202  ptr_buffer[2][iw] = engine.U_gpu.data();
203  ptr_buffer[3][iw] = engine.p_gpu.data();
204  ptr_buffer[4][iw] = engine.Binv_gpu.data();
205  ptr_buffer[5][iw] = engine.Binv_gpu.data() + delay_count * lda_Binv;
206  ptr_buffer[6][iw] = engine.V_gpu.data();
207  }
208 
209  queue.enqueueH2D(prepare_inv_row_buffer_H2D);
210 
211  Value** oldRow_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data());
212  Value** invRow_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw);
213  Value** U_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 2);
214  Value** p_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 3);
215  Value** Binv_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 4);
216  Value** BinvRow_mw_ptr =
217  reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 5);
218  Value** V_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 6);
219 
220  // save Ainv[rowchanged] to invRow
221  //std::copy_n(Ainv[rowchanged], norb, invRow.data());
222  compute::BLAS::copy_batched(blas_handle, norb, oldRow_mw_ptr, 1, invRow_mw_ptr, 1, nw);
223  // multiply V (NxK) Binv(KxK) U(KxN) invRow right to the left
224  //BLAS::gemv('T', norb, delay_count, cone, U_gpu.data(), norb, invRow.data(), 1, czero, p_gpu.data(), 1);
225  //BLAS::gemv('N', delay_count, delay_count, -cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
226  //BLAS::gemv('N', norb, delay_count, cone, V.data(), norb, Binv[delay_count], 1, cone, invRow.data(), 1);
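 // Net effect of the three batched gemv calls below, in the stored row-major convention:
 //   p[k]       = dot(U[k], invRow)               for each delayed electron k
 //   t[l]       = -sum_k Binv[k][l] * p[k]        (t is written into row delay_count of Binv)
 //   invRow[i] += sum_k t[k] * V[k][i]
 // so invRow becomes the rowchanged-th row of the fully updated Ainv.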
227  compute::BLAS::gemv_batched(blas_handle, 'T', norb, delay_count, cone_vec.device_data(), U_mw_ptr, norb,
228  invRow_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, nw);
229  compute::BLAS::gemv_batched(blas_handle, 'N', delay_count, delay_count, cminusone_vec.device_data(), Binv_mw_ptr,
230  lda_Binv, p_mw_ptr, 1, czero_vec.device_data(), BinvRow_mw_ptr, 1, nw);
231  compute::BLAS::gemv_batched(blas_handle, 'N', norb, delay_count, cone_vec.device_data(), V_mw_ptr, norb,
232  BinvRow_mw_ptr, 1, cone_vec.device_data(), invRow_mw_ptr, 1, nw);
233  // mark row prepared
234  engine_leader.invRow_id = rowchanged;
235  }
236 
237  /** Do complete row updates
238  * many of these const arguments provide pointers or references to objects that do get modified.
239  * Somewhere in here is an update that doesn't end up where it belongs, resulting in a 0
240  * gradient later.
241  * Sad example of OpenMP target code that is far from clear and a poor substitute for a
242  * clear CPU reference implementation.
243  *
244  * \param[in] engines
245  * \param[in] rowchanged
246  * \param[in] psiM_g_list device ptrs
247  * \param[in] psiM_l_list device ptrs
248  * \param[in] isAccepted bool but wait some lists are also filtered
249  * \param[in] phi_vgl_v multiple walker orbital VGL
250  * \param[in] ratios
251  */
252  static void mw_updateRow(const RefVectorWithLeader<This_t>& engines,
253  MultiWalkerResource& mw_rsc,
254  const RefVector<DualMatrix<Value>>& psiMinv_refs,
255  const int rowchanged,
256  const std::vector<Value*>& psiM_g_list,
257  const std::vector<Value*>& psiM_l_list,
258  const std::vector<bool>& isAccepted,
259  const OffloadMWVGLArray<Value>& phi_vgl_v,
260  const std::vector<Value>& ratios)
261  {
262  auto& engine_leader = engines.getLeader();
263  engine_leader.guard_no_delay();
264 
265  const size_t n_accepted = psiM_g_list.size();
266 #ifndef NDEBUG
267  size_t n_true = std::count_if(isAccepted.begin(), isAccepted.end(), [](bool accepted) { return accepted; });
268  assert(n_accepted == n_true);
269 #endif
270  if (n_accepted == 0)
271  return;
272 
273  auto& queue = mw_rsc.queue;
274  auto& blas_handle = mw_rsc.blas_handle;
275  auto& updateRow_buffer_H2D = mw_rsc.updateRow_buffer_H2D;
276  auto& mw_temp = mw_rsc.mw_temp;
277  auto& mw_rcopy = mw_rsc.mw_rcopy;
278  auto& cone_vec = mw_rsc.cone_vec;
279  auto& czero_vec = mw_rsc.czero_vec;
280  const int norb = engine_leader.invRow.size();
281  const int lda = psiMinv_refs[0].get().cols();
282  const int nw = engines.size();
283  const size_t phi_vgl_stride = nw * norb;
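 // phi_vgl_v is laid out as [VGL component][walker][orbital]; phi_vgl_stride (= nw * norb)
 // is the distance between successive VGL components of the same walker/orbital entry.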
284  mw_temp.resize(norb * n_accepted);
285  mw_rcopy.resize(norb * n_accepted);
286 
287  constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking
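 // Packed layout in updateRow_buffer_H2D, n_accepted entries per slot:
 // [0] Ainv  [1] phi VGL  [2] temp  [3] rcopy  [4] dpsiM out  [5] d2psiM out,
 // followed by n_accepted Value entries holding -1/ratio.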
288  updateRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted);
289 
290  // to handle T** of Ainv, psi_v, temp, rcopy
291  Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateRow_buffer_H2D.data()), num_ptrs_packed, n_accepted);
292  Value* c_ratio_inv =
293  reinterpret_cast<Value*>(updateRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted);
294  for (int iw = 0, count = 0; iw < isAccepted.size(); iw++)
295  if (isAccepted[iw])
296  {
297  ptr_buffer[0][count] = psiMinv_refs[iw].get().device_data();
298  ptr_buffer[1][count] = const_cast<Value*>(phi_vgl_v.device_data_at(0, iw, 0));
299  ptr_buffer[2][count] = mw_temp.device_data() + norb * count;
300  ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count;
301  ptr_buffer[4][count] = psiM_g_list[count];
302  ptr_buffer[5][count] = psiM_l_list[count];
303 
304  c_ratio_inv[count] = Value(-1) / ratios[iw];
305  count++;
306  }
307 
308  // update the inverse matrix
309  mw_rsc.resize_fill_constant_arrays(n_accepted);
310 
311  queue.enqueueH2D(updateRow_buffer_H2D);
312 
313  {
314  Value** Ainv_mw_ptr = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data());
315  Value** phiVGL_mw_ptr =
316  reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted);
317  Value** temp_mw_ptr =
318  reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 2);
319  Value** rcopy_mw_ptr =
320  reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 3);
321  Value** dpsiM_mw_out =
322  reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 4);
323  Value** d2psiM_mw_out =
324  reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 5);
325  Value* ratio_inv_mw =
326  reinterpret_cast<Value*>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6);
327 
328 
329  // invoke Fahy's variant of the Sherman-Morrison update.
330  compute::BLAS::gemv_batched(blas_handle, 'T', norb, norb, cone_vec.device_data(), Ainv_mw_ptr, lda, phiVGL_mw_ptr,
331  1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted);
332 
333  compute::copyAinvRow_saveGL_batched(queue, rowchanged, norb, Ainv_mw_ptr, lda, temp_mw_ptr, rcopy_mw_ptr,
334  phiVGL_mw_ptr, phi_vgl_stride, dpsiM_mw_out, d2psiM_mw_out, n_accepted);
335 
336 
337  compute::BLAS::ger_batched(blas_handle, norb, norb, ratio_inv_mw, rcopy_mw_ptr, 1, temp_mw_ptr, 1, Ainv_mw_ptr,
338  lda, n_accepted);
339  }
340  }
341 
342 public:
343  /// constructor
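 /// max_delay == 1 selects the non-delayed path: each acceptance applies the rank-1
 /// Sherman-Morrison update immediately instead of accumulating delayed rows.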
344  DelayedUpdateBatched(size_t norb, size_t max_delay)
345  : invRow_id(-1), delay_count(0), no_delayed_update_(max_delay == 1)
346  {
347  resize(norb, max_delay);
348  }
349 
351 
352  // prepare invRow and compute the old gradients.
353  template<typename GT>
354  static void mw_evalGrad(const RefVectorWithLeader<This_t>& engines,
355  MultiWalkerResource& mw_rsc,
356  const RefVector<DualMatrix<Value>>& psiMinv_refs,
357  const std::vector<const Value*>& dpsiM_row_list,
358  const int rowchanged,
359  std::vector<GT>& grad_now)
360  {
361  auto& engine_leader = engines.getLeader();
362  if (!engine_leader.no_delayed_update_)
363  mw_prepareInvRow(engines, mw_rsc, psiMinv_refs, rowchanged);
364 
365  auto& queue = mw_rsc.queue;
366  auto& evalGrad_buffer_H2D = mw_rsc.evalGrad_buffer_H2D;
367  auto& grads_value_v = mw_rsc.grads_value_v;
368 
369  const int nw = engines.size();
370  constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking
371  evalGrad_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
372  Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(evalGrad_buffer_H2D.data()), num_ptrs_packed, nw);
373  for (int iw = 0; iw < nw; iw++)
374  {
375  if (engine_leader.no_delayed_update_)
376  {
377  DualMatrix<Value>& psiMinv = psiMinv_refs[iw];
378  ptr_buffer[0][iw] = psiMinv.device_data() + rowchanged * psiMinv.cols();
379  }
380  else
381  ptr_buffer[0][iw] = engines[iw].invRow.device_data();
382  ptr_buffer[1][iw] = dpsiM_row_list[iw];
383  }
384 
385  queue.enqueueH2D(evalGrad_buffer_H2D);
386 
387  if (grads_value_v.rows() != nw || grads_value_v.cols() != GT::Size)
388  grads_value_v.resize(nw, GT::Size);
389 
390  const Value** invRow_ptr = reinterpret_cast<const Value**>(evalGrad_buffer_H2D.device_data());
391  const Value** dpsiM_row_ptr = reinterpret_cast<const Value**>(evalGrad_buffer_H2D.device_data()) + nw;
392 
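 // For each walker, calcGradients forms grad_now[iw][d] = sum_j invRow[j] * dpsiM_row[3 * j + d]
 // (d = 0, 1, 2), i.e. the dot product of the prepared inverse row with each gradient component.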
393  compute::calcGradients_batched(queue, engine_leader.invRow.size(), invRow_ptr, dpsiM_row_ptr,
394  grads_value_v.device_data(), nw);
395  queue.enqueueD2H(grads_value_v);
396  queue.sync();
397 
398  for (int iw = 0; iw < nw; iw++)
399  grad_now[iw] = {grads_value_v[iw][0], grads_value_v[iw][1], grads_value_v[iw][2]};
400  }
401 
402  template<typename GT>
403  static void mw_evalGradWithSpin(const RefVectorWithLeader<This_t>& engines,
404  MultiWalkerResource& mw_rsc,
405  const RefVector<DualMatrix<Value>>& psiMinv_refs,
406  const std::vector<const Value*>& dpsiM_row_list,
407  OffloadMatrix<Complex>& mw_dspin,
408  const int rowchanged,
409  std::vector<GT>& grad_now,
410  std::vector<Complex>& spingrad_now)
411  {
412  auto& engine_leader = engines.getLeader();
413  auto& buffer_H2D = mw_rsc.evalGrad_buffer_H2D;
414  auto& grads_value_v = mw_rsc.grads_value_v;
415  auto& spingrads_value_v = mw_rsc.spingrads_value_v;
416 
417  //Need to pack these into a transfer buffer since psiMinv and dpsiM_row_list are not multiwalker data
418  //i.e. each engine has its own psiMinv which is an OffloadMatrix instead of the leader having the data for all the walkers in the crowd.
419  //Wouldn't have to do this if dpsiM and psiMinv were part of the mw_rsc_handle_ with data across all walkers in the crowd and could just use use_device_ptr for the offload.
420  //That is how mw_dspin is handled below
421  const int norb = psiMinv_refs[0].get().rows();
422  const int nw = engines.size();
423  constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking
424  buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
425  Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(buffer_H2D.data()), num_ptrs_packed, nw);
426  for (int iw = 0; iw < nw; iw++)
427  {
428  DualMatrix<Value>& psiMinv = psiMinv_refs[iw];
429  ptr_buffer[0][iw] = psiMinv.device_data() + rowchanged * psiMinv.cols();
430  ptr_buffer[1][iw] = dpsiM_row_list[iw];
431  }
432 
433  constexpr unsigned DIM = GT::Size;
434  grads_value_v.resize(nw, DIM);
435  spingrads_value_v.resize(nw);
436  auto* __restrict__ grads_value_v_ptr = grads_value_v.data();
437  auto* __restrict__ spingrads_value_v_ptr = spingrads_value_v.data();
438  auto* buffer_H2D_ptr = buffer_H2D.data();
439  auto* mw_dspin_ptr = mw_dspin.data();
440 
441  //Note that mw_dspin should already be in sync between device and host...updateTo was called in
442  //SPOSet::mw_evaluateVGLWithSpin to sync
443  //Also note that since mw_dspin is Dual, I can just use mw_dspin.data() above and then use it directly inside
444  //the offload region. OMP will figure out the correct translation to the device address, i.e. no
445  //need to include it in the PRAGMA_OFFLOAD below
446  PRAGMA_OFFLOAD("omp target teams distribute num_teams(nw) \
447  map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
448  map(always, from: grads_value_v_ptr[:grads_value_v.size()]) \
449  map(always, from: spingrads_value_v_ptr[:spingrads_value_v.size()])")
450  for (int iw = 0; iw < nw; iw++)
451  {
452  const Value* __restrict__ invRow_ptr = reinterpret_cast<const Value**>(buffer_H2D_ptr)[iw];
453  const Value* __restrict__ dpsiM_row_ptr = reinterpret_cast<const Value**>(buffer_H2D_ptr)[nw + iw];
454  Value grad_x(0), grad_y(0), grad_z(0);
455  Complex spingrad(0);
456 #if defined(QMC_COMPLEX)
457  // COMPILER WORKAROUND
458  // This was causing a llvm-link error in icpx due to the lack of declare reduction on complex datatypes.
459  // Keep real builds free of any reduction on a complex datatype. Just serialize the reduction.
460  // Because mw_evalGradWithSpin is only being called in complex builds in simulations, the impact of this workaround is basically zero.
461  // It is still beneficial to keep it functional in real builds.
462  PRAGMA_OFFLOAD("omp parallel for reduction(+: grad_x, grad_y, grad_z, spingrad)")
463 #endif
464  for (int iorb = 0; iorb < norb; iorb++)
465  {
466  grad_x += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM];
467  grad_y += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM + 1];
468  grad_z += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM + 2];
469  spingrad += invRow_ptr[iorb] * mw_dspin_ptr[iw * norb + iorb];
470  }
471  grads_value_v_ptr[iw * DIM] = grad_x;
472  grads_value_v_ptr[iw * DIM + 1] = grad_y;
473  grads_value_v_ptr[iw * DIM + 2] = grad_z;
474  spingrads_value_v_ptr[iw] = spingrad;
475  }
476 
477  for (int iw = 0; iw < nw; iw++)
478  {
479  grad_now[iw] = {grads_value_v[iw][0], grads_value_v[iw][1], grads_value_v[iw][2]};
480  spingrad_now[iw] = spingrads_value_v[iw];
481  }
482  }
483 
484  /** Update the "local" psiMinv_ on the device.
485  * Side Effect Transfers:
486  * * psiMinv_ is transferred back to host since single calls from QMCHamiltonian and others
487  * * expect it to be.
488  *
489  * Forced to use OpenMP target since resources are banned for single walker function APIs
490  * and the acquireRelease pattern for a single DDB was removed by #3324
491  */
492  template<typename VVT, typename FPVT>
493  void updateRow(DualMatrix<Value>& Ainv, int rowchanged, const VVT& phiV, FPVT c_ratio_in)
494  {
495  guard_no_delay();
496  // update the inverse matrix
497  constexpr Value cone(1), czero(0);
498  const int norb = Ainv.rows();
499  const int lda = Ainv.cols();
500  temp.resize(norb);
501  rcopy.resize(norb);
502  // invoke Fahy's variant of the Sherman-Morrison update.
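 // In the stored row-major convention the ratio of the move is dot(Ainv[rowchanged], phiV), so
 //   temp[j]           = dot(Ainv[j], phiV)              (gemv 'T')
 //   rcopy             = copy of Ainv[rowchanged]
 //   temp[rowchanged] -= 1
 //   Ainv[j][i]       -= temp[j] * rcopy[i] / ratio      (ger)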
503  int dummy_handle = 0;
504  const Value* phiV_ptr = phiV.data();
505  Value* Ainv_ptr = Ainv.data();
506  Value* temp_ptr = temp.data();
507  Value* rcopy_ptr = rcopy.data();
508  // psiMinv_ must be up-to-date on both host and device
509  PRAGMA_OFFLOAD("omp target data map(always, tofrom: Ainv_ptr[:Ainv.size()]) \
510  use_device_ptr(phiV_ptr, Ainv_ptr, temp_ptr, rcopy_ptr)")
511  {
512  int success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1);
513  if (success != 0)
514  throw std::runtime_error("ompBLAS::gemv failed.");
515 
516  PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)")
517  for (int i = 0; i < norb; i++)
518  {
519  rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
520  if (i == 0)
521  temp_ptr[rowchanged] -= cone;
522  }
523 
524  success = ompBLAS::ger(dummy_handle, norb, norb, static_cast<Value>(FPVT(-1) / c_ratio_in), rcopy_ptr, 1,
525  temp_ptr, 1, Ainv_ptr, lda);
526  if (success != 0)
527  throw std::runtime_error("ompBLAS::ger failed.");
528  }
529  }
530 
531  /** Accept or Reject row updates
532  * many of these const arguments provide pointers or references
533  * to objects that do get modified.
534  * \param[in] engines
535  * \param[in] rowchanged
536  * \param[in] psiM_g_list
537  * \param[in] psiM_l_list
538  * \param[in] isAccepted
539  * \param[in] phi_vgl_v multiple walker orbital VGL
540  * \param[in] ratios
541  */
542  static void mw_accept_rejectRow(const RefVectorWithLeader<This_t>& engines,
543  MultiWalkerResource& mw_rsc,
544  const RefVector<DualMatrix<Value>>& psiMinv_refs,
545  const int rowchanged,
546  const std::vector<Value*>& psiM_g_list,
547  const std::vector<Value*>& psiM_l_list,
548  const std::vector<bool>& isAccepted,
549  const OffloadMWVGLArray<Value>& phi_vgl_v,
550  const std::vector<Value>& ratios)
551  {
552  auto& engine_leader = engines.getLeader();
553  // invRow consumed, mark invRow_id unset
554  engine_leader.invRow_id = -1;
555 
556  if (engine_leader.no_delayed_update_)
557  {
558  mw_updateRow(engines, mw_rsc, psiMinv_refs, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios);
559  return;
560  }
561 
562  auto& queue = mw_rsc.queue;
563  auto& blas_handle = mw_rsc.blas_handle;
564  auto& cminusone_vec = mw_rsc.cminusone_vec;
565  auto& cone_vec = mw_rsc.cone_vec;
566  auto& czero_vec = mw_rsc.czero_vec;
567  auto& accept_rejectRow_buffer_H2D = mw_rsc.accept_rejectRow_buffer_H2D;
568  int& delay_count = engine_leader.delay_count;
569  const int lda_Binv = engine_leader.Binv_gpu.cols();
570  const int norb = engine_leader.invRow.size();
571  const int nw = engines.size();
572  const int n_accepted = psiM_g_list.size();
573  const size_t phi_vgl_stride = nw * norb;
574 
575  constexpr size_t num_ptrs_packed = 12; // it must match packing and unpacking
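 // Packed layout in accept_rejectRow_buffer_H2D, nw entries per slot (accepted walkers first,
 // then rejected ones; slots [9]-[11] and the trailing 1/ratio values are only filled for
 // accepted walkers):
 // [0] Ainv row  [1] V_gpu  [2] U_gpu row delay_count  [3] p_gpu  [4] Binv_gpu
 // [5] Binv row delay_count  [6] Binv column delay_count  [7] delay_list
 // [8] V_gpu row delay_count  [9] phi VGL  [10] dpsiM out  [11] d2psiM out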
576  accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * nw);
577  mw_rsc.resize_fill_constant_arrays(nw);
578 
579  Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.data()), num_ptrs_packed, nw);
580  Value* c_ratio_inv =
581  reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * nw);
582  for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++)
583  {
584  DualMatrix<Value>& psiMinv = psiMinv_refs[iw];
585  const int lda = psiMinv.cols();
586  This_t& engine = engines[iw];
587  if (isAccepted[iw])
588  {
589  ptr_buffer[0][count_accepted] = psiMinv.device_data() + lda * rowchanged;
590  ptr_buffer[1][count_accepted] = engine.V_gpu.data();
591  ptr_buffer[2][count_accepted] = engine.U_gpu.data() + norb * delay_count;
592  ptr_buffer[3][count_accepted] = engine.p_gpu.data();
593  ptr_buffer[4][count_accepted] = engine.Binv_gpu.data();
594  ptr_buffer[5][count_accepted] = engine.Binv_gpu.data() + delay_count * lda_Binv;
595  ptr_buffer[6][count_accepted] = engine.Binv_gpu.data() + delay_count;
596  ptr_buffer[7][count_accepted] = reinterpret_cast<Value*>(engine.delay_list_gpu.data());
597  ptr_buffer[8][count_accepted] = engine.V_gpu.data() + norb * delay_count;
598  ptr_buffer[9][count_accepted] = const_cast<Value*>(phi_vgl_v.device_data_at(0, iw, 0));
599  ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted];
600  ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted];
601  c_ratio_inv[count_accepted] = Value(1) / ratios[iw];
602  count_accepted++;
603  }
604  else
605  {
606  ptr_buffer[0][n_accepted + count_rejected] = psiMinv.device_data() + lda * rowchanged;
607  ptr_buffer[1][n_accepted + count_rejected] = engine.V_gpu.data();
608  ptr_buffer[2][n_accepted + count_rejected] = engine.U_gpu.data() + norb * delay_count;
609  ptr_buffer[3][n_accepted + count_rejected] = engine.p_gpu.data();
610  ptr_buffer[4][n_accepted + count_rejected] = engine.Binv_gpu.data();
611  ptr_buffer[5][n_accepted + count_rejected] = engine.Binv_gpu.data() + delay_count * lda_Binv;
612  ptr_buffer[6][n_accepted + count_rejected] = engine.Binv_gpu.data() + delay_count;
613  ptr_buffer[7][n_accepted + count_rejected] = reinterpret_cast<Value*>(engine.delay_list_gpu.data());
614  ptr_buffer[8][n_accepted + count_rejected] = engine.V_gpu.data() + norb * delay_count;
615  count_rejected++;
616  }
617  }
618 
619  queue.enqueueH2D(accept_rejectRow_buffer_H2D);
620 
621  Value** invRow_mw_ptr = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data());
622  Value** V_mw_ptr = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw);
623  Value** U_row_mw_ptr =
624  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 2);
625  Value** p_mw_ptr = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 3);
626  Value** Binv_mw_ptr =
627  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 4);
628  Value** BinvRow_mw_ptr =
629  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 5);
630  Value** BinvCol_mw_ptr =
631  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 6);
632  int** delay_list_mw_ptr =
633  reinterpret_cast<int**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 7);
634  Value** V_row_mw_ptr =
635  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 8);
636  Value** phiVGL_mw_ptr =
637  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 9);
638  Value** dpsiM_mw_out =
639  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10);
640  Value** d2psiM_mw_out =
641  reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11);
642  Value* ratio_inv_mw_ptr =
643  reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12);
644 
645  //std::copy_n(Ainv[rowchanged], norb, V[delay_count]);
646  compute::BLAS::copy_batched(blas_handle, norb, invRow_mw_ptr, 1, V_row_mw_ptr, 1, nw);
647  // handle accepted walkers
648  // the new Binv is [[X Y] [Z sigma]]
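 // Growing Binv by one row and one column for the newly accepted electron: sigma = 1/ratio of
 // the accepted move, Y is the new column, and X is the correction to the existing
 // delay_count x delay_count block. add_delay_list_save_sigma_VGL_batched below records sigma,
 // the accepted row id, and the accepted orbital values with their gradients/laplacians.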
649  //BLAS::gemv('T', norb, delay_count + 1, cminusone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
650  compute::BLAS::gemv_batched(blas_handle, 'T', norb, delay_count, cminusone_vec.device_data(), V_mw_ptr, norb,
651  phiVGL_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted);
652  // Y
653  //BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero, Binv.data() + delay_count,
654  // lda_Binv);
655  compute::BLAS::gemv_batched(blas_handle, 'T', delay_count, delay_count, ratio_inv_mw_ptr, Binv_mw_ptr, lda_Binv,
656  p_mw_ptr, 1, czero_vec.device_data(), BinvCol_mw_ptr, lda_Binv, n_accepted);
657  // X
658  //BLAS::ger(delay_count, delay_count, cone, Binv[delay_count], 1, Binv.data() + delay_count, lda_Binv,
659  // Binv.data(), lda_Binv);
660  compute::BLAS::ger_batched(blas_handle, delay_count, delay_count, cone_vec.device_data(), BinvRow_mw_ptr, 1,
661  BinvCol_mw_ptr, lda_Binv, Binv_mw_ptr, lda_Binv, n_accepted);
662  // sigma and Z
663  compute::add_delay_list_save_sigma_VGL_batched(queue, delay_list_mw_ptr, rowchanged, delay_count, Binv_mw_ptr,
664  lda_Binv, ratio_inv_mw_ptr, phiVGL_mw_ptr, phi_vgl_stride,
665  U_row_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, norb, n_accepted, nw);
666  delay_count++;
667  // update Ainv when maximal delay is reached
668  if (delay_count == lda_Binv)
669  mw_updateInvMat(engines, mw_rsc, psiMinv_refs);
670  }
671 
672  /** update the full Ainv and reset delay_count
673  * @param psiMinv_refs inverse matrices of all walkers, updated in place
674  */
675  static void mw_updateInvMat(const RefVectorWithLeader<This_t>& engines,
676  MultiWalkerResource& mw_rsc,
677  const RefVector<DualMatrix<Value>>& psiMinv_refs)
678  {
679  auto& engine_leader = engines.getLeader();
680  int& delay_count = engine_leader.delay_count;
681  if (delay_count == 0)
682  return;
683  // update the inverse matrix
684  auto& queue = mw_rsc.queue;
685  auto& blas_handle = mw_rsc.blas_handle;
686  auto& updateInv_buffer_H2D = mw_rsc.updateInv_buffer_H2D;
687  const int norb = engine_leader.invRow.size();
688  const int lda = psiMinv_refs[0].get().cols();
689  const int nw = engines.size();
690 
691  constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking
692  updateInv_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
693  mw_rsc.resize_fill_constant_arrays(nw);
694 
695  Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateInv_buffer_H2D.data()), num_ptrs_packed, nw);
696  for (int iw = 0; iw < nw; iw++)
697  {
698  This_t& engine = engines[iw];
699  ptr_buffer[0][iw] = engine.U_gpu.data();
700  ptr_buffer[1][iw] = psiMinv_refs[iw].get().device_data();
701  ptr_buffer[2][iw] = engine.tempMat_gpu.data();
702  ptr_buffer[3][iw] = reinterpret_cast<Value*>(engine.delay_list_gpu.data());
703  ptr_buffer[4][iw] = engine.V_gpu.data();
704  ptr_buffer[5][iw] = engine.Binv_gpu.data();
705  }
706 
707  queue.enqueueH2D(updateInv_buffer_H2D);
708 
709  Value** U_mw_ptr = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data());
710  Value** Ainv_mw_ptr = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw);
711  Value** tempMat_mw_ptr = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 2);
712  int** delay_list_mw_ptr = reinterpret_cast<int**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 3);
713  Value** V_mw_ptr = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 4);
714  Value** Binv_mw_ptr = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 5);
715 
716  /*
717  if (delay_count == 1)
718  {
719  // this is a special case invoking the Fahy's variant of Sherman-Morrison update.
720  // Only use the first norb elements of tempMat as a temporal array
721  BLAS::gemv('T', norb, norb, cone, Ainv.data(), norb, U[0], 1, czero, temp.data(), 1);
722  temp[delay_list[0]] -= cone;
723  BLAS::ger(norb, norb, -Binv[0][0], V[0], 1, temp.data(), 1, Ainv.data(), norb);
724  }
725  else
726 */
727  {
728  const int lda_Binv = engine_leader.Binv_gpu.cols();
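 // Apply the delay_count accumulated low-rank corrections to Ainv in one shot
 // (the delayed-update analogue of the Sherman-Morrison-Woodbury formula):
 // project Ainv onto the delayed orbital rows (first gemm into tempMat), subtract the identity
 // contribution at the delayed entries (applyW), fold Binv in through V (second gemm, reusing
 // U_gpu as scratch), then accumulate the rank-delay_count correction into Ainv (last gemm).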
729  compute::BLAS::gemm_batched(blas_handle, 'T', 'N', delay_count, norb, norb, Value(1), U_mw_ptr, norb, Ainv_mw_ptr,
730  lda, Value(0), tempMat_mw_ptr, lda_Binv, nw);
731  compute::applyW_batched(queue, delay_list_mw_ptr, delay_count, tempMat_mw_ptr, lda_Binv, nw);
732  compute::BLAS::gemm_batched(blas_handle, 'N', 'N', norb, delay_count, delay_count, Value(1), V_mw_ptr, norb,
733  Binv_mw_ptr, lda_Binv, Value(0), U_mw_ptr, norb, nw);
734  compute::BLAS::gemm_batched(blas_handle, 'N', 'N', norb, norb, delay_count, Value(-1), U_mw_ptr, norb,
735  tempMat_mw_ptr, lda_Binv, Value(1), Ainv_mw_ptr, lda, nw);
736  }
737  delay_count = 0;
738  }
739 
740  /*
741  inline void print_Ainv(const RefVector<This_t>& engines)
742  {
743  for (This_t& engine : engines)
744  {
745  std::cout << "debug Ainv host " << engine.get_psiMinv()[0][0] << " " << engine.get_psiMinv()[0][32] << " "
746  << engine.get_psiMinv()[1][0] << " " << engine.get_psiMinv()[1][32] << std::endl;
747  auto* temp_ptr = engine.get_psiMinv().data();
748  PRAGMA_OFFLOAD("omp target update from(temp_ptr[:psiMinv_.size()])")
749  std::cout << "debug Ainv devi";
750  for (int row = 0; row < psiMinv_.rows(); row++)
751  {
752  for (int col = 0; col < psiMinv_.cols(); col++)
753  std::cout << " " << row << " " << col << " " << engine.get_psiMinv()[row][col];
754  std::cout << std::endl;
755  }
756  }
757  }
758 */
759 
760  /** return invRow host or device pointers based on on_host request
761  * prepare invRow if it has not been prepared already.
762  */
763  static std::vector<const Value*> mw_getInvRow(const RefVectorWithLeader<This_t>& engines,
764  MultiWalkerResource& mw_rsc,
765  const RefVector<DualMatrix<Value>>& psiMinv_refs,
766  const int row_id,
767  bool on_host)
768  {
769  auto& engine_leader = engines.getLeader();
770  auto& queue = mw_rsc.queue;
771  if (engine_leader.no_delayed_update_)
772  queue.sync();
773  else if (engine_leader.invRow_id != row_id)
774  {
775  // this can be skipped if mw_evalGrad gets called already.
776  mw_prepareInvRow(engines, mw_rsc, psiMinv_refs, row_id);
777  queue.sync();
778  }
779 
780  std::vector<const Value*> row_ptr_list;
781  row_ptr_list.reserve(psiMinv_refs.size());
782  if (on_host)
783  {
784  // copy values to host and return host pointer
785  if (engine_leader.no_delayed_update_)
786  for (DualMatrix<Value>& psiMinv : psiMinv_refs)
787  {
788  const size_t ncols = psiMinv.cols();
789  psiMinv.updateFrom(ncols, row_id * ncols);
790  row_ptr_list.push_back(psiMinv.data() + row_id * ncols);
791  }
792  else
793  for (This_t& engine : engines)
794  {
795  engine.invRow.updateFrom();
796  row_ptr_list.push_back(engine.invRow.data());
797  }
798  }
799  else
800  {
801  // return device pointer
802  if (engine_leader.no_delayed_update_)
803  for (DualMatrix<Value>& psiMinv : psiMinv_refs)
804  row_ptr_list.push_back(psiMinv.device_data() + row_id * psiMinv.cols());
805  else
806  for (This_t& engine : engines)
807  row_ptr_list.push_back(engine.invRow.device_data());
808  }
809  return row_ptr_list;
810  }
811 
812  /// transfer Ainv to the host
813  static void mw_transferAinv_D2H(const RefVectorWithLeader<This_t>& engines,
814  MultiWalkerResource& mw_rsc,
815  const RefVector<DualMatrix<Value>>& psiMinv_refs)
816  {
817  auto& engine_leader = engines.getLeader();
818  auto& queue = mw_rsc.queue;
819  engine_leader.guard_no_delay();
820 
821  for (DualMatrix<Value>& psiMinv : psiMinv_refs)
822  queue.enqueueD2H(psiMinv);
823  queue.sync();
824  }
825 };
826 } // namespace qmcplusplus
827 
828 #endif // QMCPLUSPLUS_DELAYED_UPDATE_BATCHED_H