#ifndef QMCPLUSPLUS_DELAYED_UPDATE_BATCHED_H
#define QMCPLUSPLUS_DELAYED_UPDATE_BATCHED_H

/** implements dirac matrix delayed update using OpenMP offload and CUDA
 */
template<PlatformKind PL, typename VALUE>
class DelayedUpdateBatched
{
  // ...
  template<typename DT> // ... (alias template, definition elided in this listing)
  template<typename DT> // ... (alias template, definition elided in this listing)
  // ...

  /// resize the internal storage
  void resize(int norb, int delay)
  {
    // ...
    V_gpu.resize(delay, norb);
    U_gpu.resize(delay, norb);
    // ...
  }

  /// ensure no previous delay left
  void guard_no_delay() const
  {
    if (delay_count != 0)
      throw std::runtime_error("BUG: unexpected call sequence delay_count is not 0");
  }
  /// compute the row of up-to-date Ainv
  static void mw_prepareInvRow(const RefVectorWithLeader<This_t>& engines,
                               MultiWalkerResource& mw_rsc,
                               const RefVector<DualMatrix<Value>>& psiMinv_refs,
                               const int rowchanged)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    const int norb = engine_leader.invRow.size();
    const int nw   = engines.size();

    constexpr size_t num_ptrs_packed = 7; // number of pointers packed into the transfer buffer
    prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
    // ...
    const int lda_Binv = engine_leader.Binv_gpu.cols();
    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.data()), num_ptrs_packed, nw);
    for (int iw = 0; iw < nw; iw++)
    {
      This_t& engine = engines[iw];
      // ...
      ptr_buffer[0][iw] = psiMinv.device_data() + rowchanged * psiMinv.cols();
      ptr_buffer[1][iw] = engine.invRow.device_data();
      ptr_buffer[2][iw] = engine.U_gpu.data();
      ptr_buffer[3][iw] = engine.p_gpu.data();
      ptr_buffer[4][iw] = engine.Binv_gpu.data();
      // ...
      ptr_buffer[6][iw] = engine.V_gpu.data();
    }

    queue.enqueueH2D(prepare_inv_row_buffer_H2D);

    Value** oldRow_mw_ptr  = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data());
    Value** invRow_mw_ptr  = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw);
    Value** U_mw_ptr       = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 2);
    Value** p_mw_ptr       = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 3);
    Value** Binv_mw_ptr    = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 4);
    Value** BinvRow_mw_ptr = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 5);
    Value** V_mw_ptr       = reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.device_data() + sizeof(Value*) * nw * 6);

    // assemble invRow from the stored Ainv row and the delayed corrections
    // ...
    compute::BLAS::gemv_batched(blas_handle, /* ... */,
                                invRow_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, nw);
    compute::BLAS::gemv_batched(blas_handle, /* ... */,
                                lda_Binv, p_mw_ptr, 1, czero_vec.device_data(), BinvRow_mw_ptr, 1, nw);
    compute::BLAS::gemv_batched(blas_handle, /* ... */,
                                BinvRow_mw_ptr, 1, cone_vec.device_data(), invRow_mw_ptr, 1, nw);

    engine_leader.invRow_id = rowchanged;
  }
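// ---------------------------------------------------------------------------
// Illustrative sketch (separate, self-contained program; not part of this
// header): the pointer-packing idiom used by mw_prepareInvRow above. Per-walker
// pointers for several arrays are packed into one contiguous byte buffer so a
// single host-to-device copy moves all of them; the typed pointer arrays are
// then recovered by byte offsets of sizeof(Value*) * nw. Host memory stands in
// for the device buffer here, and all names are hypothetical.
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  using Value = double;
  const int nw = 3;                      // number of walkers
  constexpr std::size_t num_ptrs_packed = 2;

  std::vector<std::vector<Value>> a(nw, std::vector<Value>(4, 1.0));
  std::vector<std::vector<Value>> b(nw, std::vector<Value>(4, 2.0));

  // one flat byte buffer holding num_ptrs_packed pointer arrays of length nw
  std::vector<char> buffer(sizeof(Value*) * num_ptrs_packed * nw);
  Value** ptrs = reinterpret_cast<Value**>(buffer.data());
  for (int iw = 0; iw < nw; iw++)
  {
    ptrs[iw]      = a[iw].data();        // slot 0: pointers into the a arrays
    ptrs[nw + iw] = b[iw].data();        // slot 1: pointers into the b arrays
  }

  // recover the typed pointer arrays by offset, as the batched kernels do
  Value** a_mw_ptr = reinterpret_cast<Value**>(buffer.data());
  Value** b_mw_ptr = reinterpret_cast<Value**>(buffer.data() + sizeof(Value*) * nw);
  std::printf("a[1][0] = %g, b[2][0] = %g\n", a_mw_ptr[1][0], b_mw_ptr[2][0]);
  return 0;
}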
  /// do complete row updates
  static void mw_updateRow(const RefVectorWithLeader<This_t>& engines,
                           MultiWalkerResource& mw_rsc,
                           const RefVector<DualMatrix<Value>>& psiMinv_refs,
                           const int rowchanged,
                           const std::vector<Value*>& psiM_g_list,
                           const std::vector<Value*>& psiM_l_list,
                           const std::vector<bool>& isAccepted,
                           const OffloadMWVGLArray<Value>& phi_vgl_v,
                           const std::vector<Value>& ratios)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    const size_t n_accepted = psiM_g_list.size();
    size_t n_true = std::count_if(isAccepted.begin(), isAccepted.end(), [](bool accepted) { return accepted; });
    assert(n_accepted == n_true);
    // ...
    auto& mw_temp = mw_rsc.mw_temp;
    // ...
    const int norb              = engine_leader.invRow.size();
    const int lda               = psiMinv_refs[0].get().cols();
    const int nw                = engines.size();
    const size_t phi_vgl_stride = nw * norb;
    mw_temp.resize(norb * n_accepted);
    mw_rcopy.resize(norb * n_accepted);

    constexpr size_t num_ptrs_packed = 6; // number of pointers packed into the transfer buffer
    updateRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted);
    // ...
    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateRow_buffer_H2D.data()), num_ptrs_packed, n_accepted);
    Value* c_ratio_inv =
        reinterpret_cast<Value*>(updateRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted);
    for (int iw = 0, count = 0; iw < isAccepted.size(); iw++)
      if (isAccepted[iw])
      {
        ptr_buffer[0][count] = psiMinv_refs[iw].get().device_data();
        // ...
        ptr_buffer[2][count] = mw_temp.device_data() + norb * count;
        ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count;
        ptr_buffer[4][count] = psiM_g_list[count];
        ptr_buffer[5][count] = psiM_l_list[count];

        c_ratio_inv[count] = Value(-1) / ratios[iw];
        count++;
      }
    // ...
    queue.enqueueH2D(updateRow_buffer_H2D);

    Value** Ainv_mw_ptr   = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data());
    Value** phiVGL_mw_ptr = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted);
    Value** temp_mw_ptr   = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 2);
    Value** rcopy_mw_ptr  = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 3);
    Value** dpsiM_mw_out  = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 4);
    Value** d2psiM_mw_out = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 5);
    Value* ratio_inv_mw   = reinterpret_cast<Value*>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6);

    // batched gemv producing the temp vectors for the accepted walkers
    compute::BLAS::gemv_batched(blas_handle, /* ... */,
                                1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted);
    // copy the changed row of Ainv and save the gradient/laplacian outputs
    copyAinvRow_saveGL_batched(queue, /* ... */,
                               phiVGL_mw_ptr, phi_vgl_stride, dpsiM_mw_out, d2psiM_mw_out, n_accepted);
    // ...
  }
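// ---------------------------------------------------------------------------
// Illustrative sketch (separate, self-contained program; not part of this
// header): the phi_vgl_stride convention used above. The multi-walker VGL
// array is assumed to pack components as [component][walker][orbital], so two
// components of the same walker/orbital slice are nw * norb elements apart,
// which is exactly phi_vgl_stride. The 5-component value/gradient/laplacian
// packing and all names here are assumptions of this sketch.
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const int nw = 2, norb = 3;
  const int DIM_VGL = 5; // value, grad_x, grad_y, grad_z, laplacian (assumed)
  const std::size_t phi_vgl_stride = static_cast<std::size_t>(nw) * norb;

  std::vector<double> phi_vgl(DIM_VGL * phi_vgl_stride);
  for (std::size_t i = 0; i < phi_vgl.size(); i++)
    phi_vgl[i] = static_cast<double>(i); // recognizable fill values

  const int iw = 1; // pick the second walker
  const double* value_row = phi_vgl.data() + iw * norb;                      // component 0
  const double* lapl_row  = phi_vgl.data() + 4 * phi_vgl_stride + iw * norb; // component 4
  std::printf("value[0] = %g, laplacian[0] = %g\n", value_row[0], lapl_row[0]);
  return 0;
}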
  template<typename GT>
  static void mw_evalGrad(const RefVectorWithLeader<This_t>& engines,
                          MultiWalkerResource& mw_rsc,
                          const RefVector<DualMatrix<Value>>& psiMinv_refs,
                          const std::vector<const Value*>& dpsiM_row_list,
                          const int rowchanged,
                          std::vector<GT>& grad_now)
  {
    auto& engine_leader = engines.getLeader();
    if (!engine_leader.no_delayed_update_)
      mw_prepareInvRow(engines, mw_rsc, psiMinv_refs, rowchanged);
    // ...
    const int nw = engines.size();
    constexpr size_t num_ptrs_packed = 2; // number of pointers packed into the transfer buffer
    evalGrad_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
    Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(evalGrad_buffer_H2D.data()), num_ptrs_packed, nw);
    for (int iw = 0; iw < nw; iw++)
    {
      if (engine_leader.no_delayed_update_)
      {
        // ...
        ptr_buffer[0][iw] = psiMinv.device_data() + rowchanged * psiMinv.cols();
      }
      else
        ptr_buffer[0][iw] = engines[iw].invRow.device_data();
      ptr_buffer[1][iw] = dpsiM_row_list[iw];
    }

    queue.enqueueH2D(evalGrad_buffer_H2D);

    if (grads_value_v.rows() != nw || grads_value_v.cols() != GT::Size)
      grads_value_v.resize(nw, GT::Size);

    const Value** invRow_ptr    = reinterpret_cast<const Value**>(evalGrad_buffer_H2D.device_data());
    const Value** dpsiM_row_ptr = reinterpret_cast<const Value**>(evalGrad_buffer_H2D.device_data()) + nw;

    // per-walker gradient = dot product of invRow with the dpsiM row
    calcGradients_batched(queue, /* ... */,
                          grads_value_v.device_data(), nw);
    queue.enqueueD2H(grads_value_v);
    // ...
    for (int iw = 0; iw < nw; iw++)
      grad_now[iw] = {grads_value_v[iw][0], grads_value_v[iw][1], grads_value_v[iw][2]};
  }
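// ---------------------------------------------------------------------------
// Illustrative sketch (separate, self-contained program; not part of this
// header): the quantity assembled by the batched gradient kernel above is, per
// walker, a dot product of the inverse-matrix row with the orbital-gradient
// row, grad[d] = sum_i invRow[i] * dpsiM_row[i * DIM + d]. Plain host code with
// made-up data and DIM = 3 interleaved gradient components.
#include <cstdio>

int main()
{
  const int norb = 4, DIM = 3;
  double invRow[4]        = {0.5, -1.0, 2.0, 0.25};
  double dpsiM_row[4 * 3] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};

  double grad[3] = {0, 0, 0};
  for (int iorb = 0; iorb < norb; iorb++)
    for (int d = 0; d < DIM; d++)
      grad[d] += invRow[iorb] * dpsiM_row[iorb * DIM + d];

  std::printf("grad = (%g, %g, %g)\n", grad[0], grad[1], grad[2]);
  return 0;
}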
  template<typename GT>
  static void mw_evalGradWithSpin(const RefVectorWithLeader<This_t>& engines,
                                  MultiWalkerResource& mw_rsc,
                                  const RefVector<DualMatrix<Value>>& psiMinv_refs,
                                  const std::vector<const Value*>& dpsiM_row_list,
                                  OffloadMatrix<Complex>& mw_dspin,
                                  const int rowchanged,
                                  std::vector<GT>& grad_now,
                                  std::vector<Complex>& spingrad_now)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    const int norb = psiMinv_refs[0].get().rows();
    const int nw   = engines.size();
    constexpr size_t num_ptrs_packed = 2; // number of pointers packed into the transfer buffer
    buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
    Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(buffer_H2D.data()), num_ptrs_packed, nw);
    for (int iw = 0; iw < nw; iw++)
    {
      // ...
      ptr_buffer[0][iw] = psiMinv.device_data() + rowchanged * psiMinv.cols();
      ptr_buffer[1][iw] = dpsiM_row_list[iw];
    }

    constexpr unsigned DIM = GT::Size;
    grads_value_v.resize(nw, DIM);
    spingrads_value_v.resize(nw);
    auto* __restrict__ grads_value_v_ptr     = grads_value_v.data();
    auto* __restrict__ spingrads_value_v_ptr = spingrads_value_v.data();
    auto* buffer_H2D_ptr                     = buffer_H2D.data();
    auto* mw_dspin_ptr                       = mw_dspin.data();
    // ...
    PRAGMA_OFFLOAD("omp target teams distribute num_teams(nw) \
                    map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
                    map(always, from: grads_value_v_ptr[:grads_value_v.size()]) \
                    map(always, from: spingrads_value_v_ptr[:spingrads_value_v.size()])")
    for (int iw = 0; iw < nw; iw++)
    {
      const Value* __restrict__ invRow_ptr    = reinterpret_cast<const Value**>(buffer_H2D_ptr)[iw];
      const Value* __restrict__ dpsiM_row_ptr = reinterpret_cast<const Value**>(buffer_H2D_ptr)[nw + iw];
      Value grad_x(0), grad_y(0), grad_z(0);
      Complex spingrad(0); // the full source guards part of this block with #if defined(QMC_COMPLEX)
      // ...
      PRAGMA_OFFLOAD("omp parallel for reduction(+: grad_x, grad_y, grad_z, spingrad)")
      for (int iorb = 0; iorb < norb; iorb++)
      {
        grad_x += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM];
        grad_y += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM + 1];
        grad_z += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM + 2];
        spingrad += invRow_ptr[iorb] * mw_dspin_ptr[iw * norb + iorb];
      }
      // ...
      grads_value_v_ptr[iw * DIM]     = grad_x;
      grads_value_v_ptr[iw * DIM + 1] = grad_y;
      grads_value_v_ptr[iw * DIM + 2] = grad_z;
      spingrads_value_v_ptr[iw]       = spingrad;
    }

    for (int iw = 0; iw < nw; iw++)
    {
      grad_now[iw]     = {grads_value_v[iw][0], grads_value_v[iw][1], grads_value_v[iw][2]};
      spingrad_now[iw] = spingrads_value_v[iw];
    }
  }
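// ---------------------------------------------------------------------------
// Illustrative sketch (separate, self-contained program; not part of this
// header): the per-walker reduction pattern of mw_evalGradWithSpin written as
// plain host OpenMP. Each gradient component is accumulated in its own scalar
// so a simple reduction clause applies, and the spin gradient couples the
// inverse row with a per-orbital spin derivative. Here the real and imaginary
// parts are reduced separately, one common way around limited compiler support
// for reductions over std::complex; data and names are made up.
#include <complex>
#include <cstdio>

int main()
{
  const int norb = 3, DIM = 3;
  double invRow[3]                  = {0.2, 0.3, 0.5};
  double dpsiM_row[3 * 3]           = {1, 0, 0, 0, 1, 0, 0, 0, 1};
  std::complex<double> dspin_row[3] = {{1, 1}, {2, -1}, {0, 3}};

  double grad_x = 0, grad_y = 0, grad_z = 0;
  double spingrad_re = 0, spingrad_im = 0; // reduce real/imag parts separately
#pragma omp parallel for reduction(+ : grad_x, grad_y, grad_z, spingrad_re, spingrad_im)
  for (int iorb = 0; iorb < norb; iorb++)
  {
    grad_x += invRow[iorb] * dpsiM_row[iorb * DIM];
    grad_y += invRow[iorb] * dpsiM_row[iorb * DIM + 1];
    grad_z += invRow[iorb] * dpsiM_row[iorb * DIM + 2];
    spingrad_re += invRow[iorb] * dspin_row[iorb].real();
    spingrad_im += invRow[iorb] * dspin_row[iorb].imag();
  }
  std::printf("grad = (%g, %g, %g), spingrad = (%g, %g)\n", grad_x, grad_y, grad_z, spingrad_re, spingrad_im);
  return 0;
}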
  /// Update the "local" psiMinv_ on the device.
  template<typename VVT, typename FPVT>
  void updateRow(DualMatrix<Value>& Ainv, int rowchanged, const VVT& phiV, FPVT c_ratio_in)
  {
    // ...
    const int norb = Ainv.rows();
    const int lda  = Ainv.cols();
    // ...
    int dummy_handle      = 0;
    const Value* phiV_ptr = phiV.data();
    Value* Ainv_ptr       = Ainv.data();
    Value* temp_ptr       = temp.data();
    Value* rcopy_ptr      = rcopy.data();
    PRAGMA_OFFLOAD("omp target data map(always, tofrom: Ainv_ptr[:Ainv.size()]) \
                    use_device_ptr(phiV_ptr, Ainv_ptr, temp_ptr, rcopy_ptr)")
    {
      // temp = Ainv^T * phiV
      int success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1);
      if (success != 0)
        throw std::runtime_error("ompBLAS::gemv failed.");

      PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)")
      for (int i = 0; i < norb; i++)
      {
        rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
        if (i == 0)
          temp_ptr[rowchanged] -= cone;
      }

      // rank-1 update: Ainv += (-1/ratio) * rcopy * temp^T
      success = ompBLAS::ger(dummy_handle, norb, norb, static_cast<Value>(FPVT(-1) / c_ratio_in), rcopy_ptr, 1,
                             temp_ptr, 1, Ainv_ptr, lda);
      if (success != 0)
        throw std::runtime_error("ompBLAS::ger failed.");
    }
  }
  /// Accept or reject row updates.
  static void mw_accept_rejectRow(const RefVectorWithLeader<This_t>& engines,
                                  MultiWalkerResource& mw_rsc,
                                  const RefVector<DualMatrix<Value>>& psiMinv_refs,
                                  const int rowchanged,
                                  const std::vector<Value*>& psiM_g_list,
                                  const std::vector<Value*>& psiM_l_list,
                                  const std::vector<bool>& isAccepted,
                                  const OffloadMWVGLArray<Value>& phi_vgl_v,
                                  const std::vector<Value>& ratios)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    if (engine_leader.no_delayed_update_)
    {
      mw_updateRow(engines, mw_rsc, psiMinv_refs, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios);
      return;
    }
    // ...
    const int lda_Binv          = engine_leader.Binv_gpu.cols();
    const int norb              = engine_leader.invRow.size();
    const int nw                = engines.size();
    const int n_accepted        = psiM_g_list.size();
    const size_t phi_vgl_stride = nw * norb;

    constexpr size_t num_ptrs_packed = 12; // number of pointers packed into the transfer buffer
    accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * nw);
    // ...
    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.data()), num_ptrs_packed, nw);
    Value* c_ratio_inv =
        reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * nw);
    for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++)
    {
      // ...
      const int lda  = psiMinv.cols();
      This_t& engine = engines[iw];
      if (isAccepted[iw])
      {
        // accepted walkers are packed into slots [0, n_accepted)
        ptr_buffer[0][count_accepted] = psiMinv.device_data() + lda * rowchanged;
        ptr_buffer[1][count_accepted] = engine.V_gpu.data();
        // ...
        ptr_buffer[3][count_accepted] = engine.p_gpu.data();
        ptr_buffer[4][count_accepted] = engine.Binv_gpu.data();
        // ...
        ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted];
        ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted];
        c_ratio_inv[count_accepted]    = Value(1) / ratios[iw];
        count_accepted++;
      }
      else
      {
        // rejected walkers fill the remaining slots [n_accepted, nw)
        ptr_buffer[0][n_accepted + count_rejected] = psiMinv.device_data() + lda * rowchanged;
        ptr_buffer[1][n_accepted + count_rejected] = engine.V_gpu.data();
        ptr_buffer[2][n_accepted + count_rejected] = engine.U_gpu.data() + norb * delay_count;
        ptr_buffer[3][n_accepted + count_rejected] = engine.p_gpu.data();
        ptr_buffer[4][n_accepted + count_rejected] = engine.Binv_gpu.data();
        ptr_buffer[5][n_accepted + count_rejected] = engine.Binv_gpu.data() + delay_count * lda_Binv;
        // ...
        ptr_buffer[7][n_accepted + count_rejected] = reinterpret_cast<Value*>(engine.delay_list_gpu.data());
        ptr_buffer[8][n_accepted + count_rejected] = engine.V_gpu.data() + norb * delay_count;
        // ...
        count_rejected++;
      }
    }
    // ...
    queue.enqueueH2D(accept_rejectRow_buffer_H2D);

    Value** invRow_mw_ptr   = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data());
    Value** V_mw_ptr        = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw);
    Value** U_row_mw_ptr    = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 2);
    Value** p_mw_ptr        = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 3);
    Value** Binv_mw_ptr     = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 4);
    Value** BinvRow_mw_ptr  = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 5);
    Value** BinvCol_mw_ptr  = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 6);
    int** delay_list_mw_ptr = reinterpret_cast<int**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 7);
    Value** V_row_mw_ptr    = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 8);
    Value** phiVGL_mw_ptr   = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 9);
    Value** dpsiM_mw_out    = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10);
    Value** d2psiM_mw_out   = reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11);
    Value* ratio_inv_mw_ptr = reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12);

    // batched BLAS and kernel calls operating on the accepted walkers
    compute::BLAS::gemv_batched(blas_handle, /* ... */,
                                phiVGL_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted);
    compute::BLAS::gemv_batched(blas_handle, /* ... */,
                                p_mw_ptr, 1, czero_vec.device_data(), BinvCol_mw_ptr, lda_Binv, n_accepted);
    compute::BLAS::ger_batched(blas_handle, /* ... */,
                               BinvCol_mw_ptr, lda_Binv, Binv_mw_ptr, lda_Binv, n_accepted);
    add_delay_list_save_sigma_VGL_batched(queue, /* ... */, lda_Binv, ratio_inv_mw_ptr, phiVGL_mw_ptr, phi_vgl_stride,
                                          U_row_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, norb, n_accepted, nw);
    // ...
  }
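// ---------------------------------------------------------------------------
// Illustrative sketch (separate, self-contained program; not part of this
// header): the two-ended packing used in the loop above. Accepted walkers are
// packed into slots [0, n_accepted) of each pointer row while rejected walkers
// fill slots [n_accepted, nw), so one batched call can operate on the accepted
// prefix while the whole set stays addressable. Names and data are made up.
#include <cstdio>
#include <vector>

int main()
{
  const std::vector<bool> isAccepted = {true, false, true, true, false};
  const int nw = static_cast<int>(isAccepted.size());
  int n_accepted = 0;
  for (bool a : isAccepted)
    if (a)
      n_accepted++;

  std::vector<int> slot_of_walker(nw);
  for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++)
    if (isAccepted[iw])
      slot_of_walker[iw] = count_accepted++;
    else
      slot_of_walker[iw] = n_accepted + count_rejected++;

  for (int iw = 0; iw < nw; iw++)
    std::printf("walker %d (%s) -> slot %d\n", iw, isAccepted[iw] ? "accepted" : "rejected", slot_of_walker[iw]);
  return 0;
}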
  /// update the full Ainv and reset delay_count
  static void mw_updateInvMat(const RefVectorWithLeader<This_t>& engines,
                              MultiWalkerResource& mw_rsc,
                              const RefVector<DualMatrix<Value>>& psiMinv_refs)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    const int norb = engine_leader.invRow.size();
    const int lda  = psiMinv_refs[0].get().cols();
    const int nw   = engines.size();

    constexpr size_t num_ptrs_packed = 6; // number of pointers packed into the transfer buffer
    updateInv_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
    // ...
    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateInv_buffer_H2D.data()), num_ptrs_packed, nw);
    for (int iw = 0; iw < nw; iw++)
    {
      This_t& engine    = engines[iw];
      ptr_buffer[0][iw] = engine.U_gpu.data();
      ptr_buffer[1][iw] = psiMinv_refs[iw].get().device_data();
      // ...
      ptr_buffer[4][iw] = engine.V_gpu.data();
      ptr_buffer[5][iw] = engine.Binv_gpu.data();
    }

    queue.enqueueH2D(updateInv_buffer_H2D);

    Value** U_mw_ptr        = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data());
    Value** Ainv_mw_ptr     = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw);
    Value** tempMat_mw_ptr  = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 2);
    int** delay_list_mw_ptr = reinterpret_cast<int**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 3);
    Value** V_mw_ptr        = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 4);
    Value** Binv_mw_ptr     = reinterpret_cast<Value**>(updateInv_buffer_H2D.device_data() + sizeof(Value*) * nw * 5);

    // ...
    const int lda_Binv = engine_leader.Binv_gpu.cols();
    // tempMat = U^T * Ainv for each walker
    compute::BLAS::gemm_batched(blas_handle, 'T', 'N', delay_count, norb, norb, Value(1), U_mw_ptr, norb, Ainv_mw_ptr,
                                lda, Value(0), tempMat_mw_ptr, lda_Binv, nw);
    // ... the remaining two gemms combine Binv and V and accumulate the delayed correction into Ainv
    compute::BLAS::gemm_batched(blas_handle, /* ... */,
                                Binv_mw_ptr, lda_Binv, Value(0), U_mw_ptr, norb, nw);
    compute::BLAS::gemm_batched(blas_handle, /* ... */,
                                tempMat_mw_ptr, lda_Binv, Value(1), Ainv_mw_ptr, lda, nw);
    // ...
  }
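// ---------------------------------------------------------------------------
// Illustrative sketch (separate, self-contained program; not part of this
// header): the rank-k structure suggested by the gemm_batched sequence above.
// A correction of the form X * Binv * Y (X: n x k, Binv: k x k, Y: k x n) can
// be built from two small products and is algebraically the same as a weighted
// sum of k*k rank-1 outer products; composing it with BLAS-3 calls is what a
// delayed update buys over applying rank-1 updates one by one. This sketch
// ignores the applyW correction and the storage conventions of the production
// kernels; all data and names are made up.
#include <cmath>
#include <cstdio>

int main()
{
  const int n = 4, k = 2;
  double X[4 * 2]    = {1, 0.5, -1, 2, 0.25, -0.5, 3, 1};  // n x k, row-major
  double Y[2 * 4]    = {0.5, 1, -1, 0, 2, -0.25, 1, 0.5};  // k x n
  double Binv[2 * 2] = {1.5, -0.5, 0.25, 2};               // k x k
  double Ainv[4 * 4] = {0};
  for (int i = 0; i < n; i++)
    Ainv[i * n + i] = 1; // start from the identity

  // path 1: two small products, C1 = X * (Binv * Y)
  double M[2 * 4] = {0}; // M = Binv * Y, k x n
  for (int s = 0; s < k; s++)
    for (int j = 0; j < n; j++)
      for (int t = 0; t < k; t++)
        M[s * n + j] += Binv[s * k + t] * Y[t * n + j];
  double C1[4 * 4] = {0};
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      for (int s = 0; s < k; s++)
        C1[i * n + j] += X[i * k + s] * M[s * n + j];

  // path 2: the same correction as a weighted sum of rank-1 outer products
  double C2[4 * 4] = {0};
  for (int s = 0; s < k; s++)
    for (int t = 0; t < k; t++)
      for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
          C2[i * n + j] += Binv[s * k + t] * X[i * k + s] * Y[t * n + j];

  double max_diff = 0;
  for (int i = 0; i < n * n; i++)
  {
    Ainv[i] -= C1[i]; // apply the rank-k correction
    max_diff = std::fmax(max_diff, std::fabs(C1[i] - C2[i]));
  }
  std::printf("max |C1 - C2| = %g (should be ~0), Ainv[0] = %g\n", max_diff, Ainv[0]);
  return 0;
}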
  /// return invRow host or device pointers based on the on_host request; prepare invRow if not already done
  static std::vector<const Value*> mw_getInvRow(const RefVectorWithLeader<This_t>& engines,
                                                MultiWalkerResource& mw_rsc,
                                                const RefVector<DualMatrix<Value>>& psiMinv_refs,
                                                const int row_id,
                                                bool on_host)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    if (engine_leader.no_delayed_update_)
    {
      // ...
    }
    else if (engine_leader.invRow_id != row_id)
    {
      // ... (prepare invRow for row_id)
    }
    // ...
    std::vector<const Value*> row_ptr_list;
    row_ptr_list.reserve(psiMinv_refs.size());
    if (on_host)
    {
      // return host pointers; pull the data back from the device first
      if (engine_leader.no_delayed_update_)
        for (DualMatrix<Value>& psiMinv : psiMinv_refs)
        {
          const size_t ncols = psiMinv.cols();
          psiMinv.updateFrom(ncols, row_id * ncols);
          row_ptr_list.push_back(psiMinv.data() + row_id * ncols);
        }
      else
        for (This_t& engine : engines)
        {
          engine.invRow.updateFrom();
          row_ptr_list.push_back(engine.invRow.data());
        }
    }
    else
    {
      // return device pointers
      if (engine_leader.no_delayed_update_)
        for (DualMatrix<Value>& psiMinv : psiMinv_refs)
          row_ptr_list.push_back(psiMinv.device_data() + row_id * psiMinv.cols());
      else
        for (This_t& engine : engines)
          row_ptr_list.push_back(engine.invRow.device_data());
    }
    return row_ptr_list;
  }
  /// transfer Ainv to the host
  static void mw_transferAinv_D2H(const RefVectorWithLeader<This_t>& engines,
                                  MultiWalkerResource& mw_rsc,
                                  const RefVector<DualMatrix<Value>>& psiMinv_refs)
  {
    auto& engine_leader = engines.getLeader();
    // ...
    engine_leader.guard_no_delay();
    // ...
    for (DualMatrix<Value>& psiMinv : psiMinv_refs)
      queue.enqueueD2H(psiMinv);
    // ...
  }

  // ...
};

#endif // QMCPLUSPLUS_DELAYED_UPDATE_BATCHED_H

Referenced declarations and briefs (from the generated documentation):

static void mw_prepareInvRow(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs, const int rowchanged)
    compute the row of up-to-date Ainv
DelayedUpdateBatched
    implements dirac matrix delayed update using OpenMP offload and CUDA
void resize(int norb, int delay)
    resize the internal storage
void add_delay_list_save_sigma_VGL_batched(Queue<PlatformKind::CUDA>& queue, int* const delay_list[], const int rowchanged, const int delay_count, T* const binv[], const int binv_lda, const T* const ratio_inv, const T* const phi_vgl_in[], const size_t phi_vgl_stride, T* const phi_out[], T* const dphi_out[], T* const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
ompBLAS_status gemv(ompBLAS_handle& handle, const char trans, const int m, const int n, const T alpha, const T* const A, const int lda, const T* const x, const int incx, const T beta, T* const y, const int incy)
DelayedUpdateBatched(size_t norb, size_t max_delay)
    default constructor
void applyW_batched(Queue<PlatformKind::CUDA>& queue, const int* const delay_list[], const int delay_count, T* const tempMat[], const int lda, const int batch_count)
static void mw_updateRow(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs, const int rowchanged, const std::vector<Value*>& psiM_g_list, const std::vector<Value*>& psiM_l_list, const std::vector<bool>& isAccepted, const OffloadMWVGLArray<Value>& phi_vgl_v, const std::vector<Value>& ratios)
    Do complete row updates; many of these const arguments provide pointers or references ...
namespace qmcplusplus
    helper functions for EinsplineSetBuilder
int delay_count
    current number of delays, increased by one for each acceptance, reset to 0 after updating Ainv ...
void fill_n(T* x, size_t count, const T& value)
compute::BLASHandle<PL> blas_handle
Vector<char, OffloadPinnedAllocator<char>> updateInv_buffer_H2D
DeviceVector<int> delay_list_gpu
    list of delayed electrons
constexpr std::complex<float> czero
constexpr std::complex<float> cone
static void mw_transferAinv_D2H(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs)
    transfer Ainv to the host
static std::vector<const Value*> mw_getInvRow(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs, const int row_id, bool on_host)
    return invRow host or device pointers based on the on_host request; prepare invRow if not already ...
static void mw_evalGrad(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs, const std::vector<const Value*>& dpsiM_row_list, const int rowchanged, std::vector<GT>& grad_now)
SoA adaptor class for Vector<TinyVector<T,D>>
UnpinnedDualVector<Value> temp
    scratch space for rank-1 update
static void mw_updateInvMat(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs)
    update the full Ainv and reset delay_count
static void mw_evalGradWithSpin(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs, const std::vector<const Value*>& dpsiM_row_list, OffloadMatrix<Complex>& mw_dspin, const int rowchanged, std::vector<GT>& grad_now, std::vector<Complex>& spingrad_now)
UnpinnedDualVector<Value> invRow
    row of up-to-date Ainv
int invRow_id
    row id corresponding to the up-to-date invRow
DeviceMatrix<Value> V_gpu
    rows of Ainv corresponding to delayed electrons
Type_t* device_data_at(const std::array<SIZET, D>& indices)
void calcGradients_batched(Queue<PlatformKind::CUDA>& queue, const int n, const T* const Ainvrow[], const T* const dpsiMrow[], T* const grads_now, const int batch_count)
void gemv_batched(BLASHandle<PlatformKind::CUDA>& handle, const char trans, const int m, const int n, const T* alpha, const T* const A[], const int lda, const T* const x[], const int incx, const T* beta, T* const y[], const int incy, const int batch_count)
void gemm_batched(BLASHandle<PlatformKind::CUDA>& handle, const char transa, const char transb, int m, int n, int k, const float& alpha, const float* const A[], int lda, const float* const B[], int ldb, const float& beta, float* const C[], int ldc, int batchCount)
Vector<char, OffloadPinnedAllocator<char>> accept_rejectRow_buffer_H2D
UnpinnedDualVector<Value> mw_rcopy
void copyAinvRow_saveGL_batched(Queue<PlatformKind::CUDA>& queue, const int rowchanged, const int n, const T* const Ainv[], const int lda, T* const temp[], T* const rcopy[], const T* const phi_vgl_in[], const size_t phi_vgl_stride, T* const dphi_out[], T* const d2phi_out[], const int batch_count)
UnpinnedDualVector<Value> mw_temp
    scratch space for rank-1 update
UnpinnedDualVector<Value> czero_vec
void resize_fill_constant_arrays(size_t nw)
Vector<char, OffloadPinnedAllocator<char>> updateRow_buffer_H2D
void guard_no_delay() const
    ensure no previous delay left
std::vector<std::reference_wrapper<T>> RefVector
void copy_batched(BLASHandle<PlatformKind::CUDA>& handle, const int n, const T* const in[], const int incx, T* const out[], const int incy, const int batch_count)
Declaration of Vector<T,Alloc>; manage memory through Alloc directly and allow referencing an existing ...
UnpinnedDualVector<Value> cminusone_vec
Vector<char, OffloadPinnedAllocator<char>> prepare_inv_row_buffer_H2D
const bool no_delayed_update_
    if true, updates are not delayed
std::complex<Real> Complex
typename RealAlias_impl<T>::value_type RealAlias
    If you have a function templated on a value that can be real or complex and you need to get the base ...
static void mw_accept_rejectRow(const RefVectorWithLeader<This_t>& engines, MultiWalkerResource& mw_rsc, const RefVector<DualMatrix<Value>>& psiMinv_refs, const int rowchanged, const std::vector<Value*>& psiM_g_list, const std::vector<Value*>& psiM_l_list, const std::vector<bool>& isAccepted, const OffloadMWVGLArray<Value>& phi_vgl_v, const std::vector<Value>& ratios)
    Accept or reject row updates; many of these const arguments provide pointers or references to objects ...
Vector<char, OffloadPinnedAllocator<char>> evalGrad_buffer_H2D
ompBLAS_status ger(ompBLAS_handle& handle, const int m, const int n, const T alpha, const T* const x, const int incx, const T* const y, const int incy, T* const A, const int lda)
UnpinnedDualVector<Value> rcopy
void ger_batched(BLASHandle<PlatformKind::CUDA>& handle, const int m, const int n, const T* alpha, const T* const x[], const int incx, const T* const y[], const int incy, T* const A[], const int lda, const int batch_count)
DeviceMatrix<Value> U_gpu
    orbital values of delayed electrons
A D-dimensional Array class based on PETE.
DeviceMatrix<Value> tempMat_gpu
    scratch space, used during inverse update
UnpinnedDualVector<Value> cone_vec
compute::Queue<PL> queue
DeviceMatrix<Value> Binv_gpu
    matrix inverse of B, at maximum KxK
DeviceVector<Value> p_gpu
    new column of B
DualVector<Complex> spingrads_value_v
DualMatrix<Value> grads_value_v
void updateRow(DualMatrix<Value>& Ainv, int rowchanged, const VVT& phiV, FPVT c_ratio_in)
    Update the "local" psiMinv_ on the device.