QMCPACK
MultiDiracDeterminant.2.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Bryan Clark, bclark@Princeton.edu, Princeton University
8 // Miguel Morales, moralessilva2@llnl.gov, Lawrence Livermore National Laboratory
9 // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
10 // Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
11 // Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
12 //
13 // File created by: Bryan Clark, bclark@Princeton.edu, Princeton University
14 //////////////////////////////////////////////////////////////////////////////////////
15 
16 
17 /**@file
18  * @brief Implement build functions: Function bodies are too big to be in a header file
19  */
22 #include "OMPTarget/ompBLAS.hpp"
25 
26 namespace qmcplusplus
27 {
28 /** shared function used by buildTableMatrix_calculateRatios */
30  int ref,
31  ValueType det0,
32  ValueType* restrict ratios,
33  const OffloadMatrix<ValueType>& psiinv,
34  const OffloadMatrix<ValueType>& psi,
35  OffloadMatrix<ValueType>& table_matrix,
36  const OffloadVector<int>& data,
37  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
39 {
40  {
42  const size_t num = psi.extent(1);
43  const size_t npairs = pairs.size();
44  //MatrixOperators::product_ABt(psiinv,psi,table_matrix);
45  const int* first = pairs.data(0);
46  const int* second = pairs.data(1);
47  for (size_t i = 0; i < npairs; ++i)
48  {
49  const int I = first[i];
50  const int J = second[i];
51  table_matrix(I, J) = simd::dot(psiinv[I], psi[J], num);
52  }
53  }
54 
55  {
57  const int* it2 = data.data();
58  const size_t nitems = sign.size();
59  const size_t nb_cols = table_matrix.cols();
60  // explore Inclusive Scan for OpenMP
61  for (size_t count = 0; count < nitems; ++count)
62  {
63  const size_t n = *it2;
64  if (count != ref)
65  ratios[count] = sign[count] * det0 *
67  : calcSmallDeterminant(n, table_matrix.data(), it2 + 1, nb_cols));
68  it2 += 3 * n + 1;
69  }
70 
71  ratios[ref] = det0;
72  }
73 }
74 
77  int ref,
78  const OffloadVector<ValueType>& det0_list,
79  const RefVector<OffloadMatrix<ValueType>>& psiinv_list,
80  const RefVector<OffloadMatrix<ValueType>>& psi_list,
81  const OffloadVector<int>& data,
82  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
84  const RefVector<OffloadMatrix<ValueType>>& table_matrix_list,
85  const RefVector<OffloadVector<ValueType>>& ratios_list)
86 {
87  const size_t nw = ratios_list.size();
88  auto& psiinv_deviceptr_list = mw_res.psiinv_deviceptr_list;
89  auto& psi_deviceptr_list = mw_res.psi_deviceptr_list;
90  auto& table_matrix_deviceptr_list = mw_res.table_matrix_deviceptr_list;
91  auto& ratios_deviceptr_list = mw_res.ratios_deviceptr_list;
92  const size_t nb_cols_table_matrix(table_matrix_list[0].get().cols());
93 
94  {
96  const size_t npairs = pairs.size();
97  const size_t num = psi_list[0].get().extent(1);
98  const size_t nitems = sign.size();
99 
100  const int* first = pairs.data(0);
101  const int* second = pairs.data(1);
102 
103  psiinv_deviceptr_list.resize(nw);
104  psi_deviceptr_list.resize(nw);
105  table_matrix_deviceptr_list.resize(nw);
106  ratios_deviceptr_list.resize(nw);
107 
108  for (size_t iw = 0; iw < nw; iw++)
109  {
110  psiinv_deviceptr_list[iw] = psiinv_list[iw].get().device_data();
111  psi_deviceptr_list[iw] = psi_list[iw].get().device_data();
112  table_matrix_deviceptr_list[iw] = table_matrix_list[iw].get().device_data();
113  ratios_deviceptr_list[iw] = ratios_list[iw].get().device_data();
114  }
115 
116  const size_t nb_cols_psi(psi_list[0].get().cols());
117  const size_t nb_cols_psiinv(psiinv_list[0].get().cols());
118 
119  auto* ratios_list_ptr = ratios_deviceptr_list.data();
120  auto* table_matrix_list_ptr = table_matrix_deviceptr_list.data();
121  const auto* psiinv_list_ptr = psiinv_deviceptr_list.data();
122  const auto* psi_list_ptr = psi_deviceptr_list.data();
123 
124  {
125  ScopedTimer local_timer(offload_timer);
126  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) \
127  map(always, to: psiinv_list_ptr[:nw], psi_list_ptr[:nw]) \
128  map(always, to: ratios_list_ptr[:nw], table_matrix_list_ptr[:nw]) \
129  map(to:first[:npairs], second[:npairs])")
130  for (uint32_t iw = 0; iw < nw; iw++)
131  for (uint32_t i = 0; i < npairs; ++i)
132  {
133  const int I = first[i];
134  const int J = second[i];
135 
136  ValueType table_matrix_local = 0.0;
137  for (uint32_t ind = 0; ind < num; ind++)
138  table_matrix_local +=
139  psiinv_list_ptr[iw][I * nb_cols_psiinv + ind] * psi_list_ptr[iw][J * nb_cols_psi + ind];
140  table_matrix_list_ptr[iw][I * nb_cols_table_matrix + J] = table_matrix_local;
141  }
142  }
143  }
144  {
146  const int max_ext_level = ndets_per_excitation_level_->size() - 1;
147 
148  // Compute workload changes drastically as the excitation level increases.
149  // this may need different parallelization strategy.
150  size_t det_offset = 1;
151  size_t data_offset = 1;
152 
153  auto update_offsets = [&](size_t ext_level) {
154  det_offset += (*ndets_per_excitation_level_)[ext_level];
155  data_offset += (*ndets_per_excitation_level_)[ext_level] * (3 * ext_level + 1);
156  };
157 
158  mw_updateRatios_det0(det0_list, ratios_deviceptr_list);
159 
160  if (max_ext_level >= 1)
161  {
162  mw_updateRatios<1>(det_offset, data_offset, data, sign, table_matrix_deviceptr_list, nb_cols_table_matrix,
163  ratios_deviceptr_list);
164  update_offsets(1);
165  }
166 
167  if (max_ext_level >= 2)
168  {
169  mw_updateRatios<2>(det_offset, data_offset, data, sign, table_matrix_deviceptr_list, nb_cols_table_matrix,
170  ratios_deviceptr_list);
171  update_offsets(2);
172  }
173 
174  if (max_ext_level >= 3)
175  {
176  mw_updateRatios<3>(det_offset, data_offset, data, sign, table_matrix_deviceptr_list, nb_cols_table_matrix,
177  ratios_deviceptr_list);
178  update_offsets(3);
179  }
180 
181  if (max_ext_level >= 4)
182  {
183  mw_updateRatios<4>(det_offset, data_offset, data, sign, table_matrix_deviceptr_list, nb_cols_table_matrix,
184  ratios_deviceptr_list);
185  update_offsets(4);
186  }
187 
188  if (max_ext_level >= 5)
189  {
190  mw_updateRatios<5>(det_offset, data_offset, data, sign, table_matrix_deviceptr_list, nb_cols_table_matrix,
191  ratios_deviceptr_list);
192  update_offsets(5);
193  }
194 
195  if (max_ext_level >= 6)
196  {
197  for (size_t iw = 0; iw < nw; iw++)
198  table_matrix_list[iw].get().updateFrom();
199  for (size_t ext_level = 6; ext_level <= max_ext_level; ext_level++)
200  {
201  mw_updateRatios_generic(ext_level, det_offset, data_offset, det_calculator_, data, sign, table_matrix_list,
202  ratios_list);
203  update_offsets(ext_level);
204  }
205  // FIXME need to transfer the part of det ratios ext_level >= 6 to the device.
206  }
207  }
208 }
209 
210 
212  int ref,
213  const OffloadMatrix<ValueType>& psiinv,
214  const OffloadMatrix<ValueType>& psi,
215  const OffloadVector<int>& data,
216  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
218  OffloadMatrix<ValueType>& table_matrix,
219  OffloadVector<ValueType>& ratios)
220 {
221  ScopedTimer local_timer(calculateRatios_timer);
222  buildTableMatrix_calculateRatios_impl(ref, ValueType(1), ratios.data(), psiinv, psi, table_matrix, data, pairs, sign);
223 }
224 
227  int ref,
228  const OffloadVector<ValueType>& det0_list,
229  const RefVector<OffloadMatrix<ValueType>>& psiinv_list,
230  const RefVector<OffloadMatrix<ValueType>>& psi_list,
231  const OffloadVector<int>& data,
232  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
234  const RefVector<OffloadMatrix<ValueType>>& table_matrix_list,
235  const RefVector<OffloadVector<ValueType>>& ratios_list)
236 {
237  ScopedTimer local_timer(calculateRatios_timer);
238  mw_buildTableMatrix_calculateRatios_impl(mw_res, ref, det0_list, psiinv_list, psi_list, data, pairs, sign,
239  table_matrix_list, ratios_list);
240 
241  {
242  ScopedTimer local_timer(transferD2H_timer);
243  for (size_t iw = 0; iw < ratios_list.size(); iw++)
244  ratios_list[iw].get().updateFrom();
245  }
246 }
247 
249  int ref,
250  const OffloadMatrix<ValueType>& psiinv,
251  const OffloadMatrix<ValueType>& psi,
252  const OffloadVector<int>& data,
253  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
255  const ValueType& det0_grad,
256  OffloadMatrix<ValueType>& table_matrix,
257  int dx,
258  int iat,
259  Matrix<GradType>& grads)
260 {
262  buildTableMatrix_calculateRatios_impl(ref, det0_grad, WorkSpace.data(), psiinv, psi, table_matrix, data, pairs, sign);
263  for (size_t count = 0; count < getNumDets(); ++count)
264  grads(count, iat)[dx] = WorkSpace[count];
265 }
266 
269  int ref,
270  int iat,
271  int dx,
272  int getNumDets,
273  const OffloadVector<ValueType>& det0_grad_list,
274  const RefVector<OffloadMatrix<ValueType>>& psiinv_list,
275  const RefVector<OffloadMatrix<ValueType>>& psi_list,
276  const OffloadVector<int>& data,
277  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
279  const RefVector<OffloadVector<ValueType>>& WorkSpace_list,
280  const RefVector<OffloadMatrix<ValueType>>& table_matrix_list,
282 {
284  mw_buildTableMatrix_calculateRatios_impl(mw_res, ref, det0_grad_list, psiinv_list, psi_list, data, pairs, sign,
285  table_matrix_list, WorkSpace_list);
286 
287  const size_t nw = WorkSpace_list.size();
288  OffloadVector<ValueType*> WorkSpace_deviceptr_list(nw);
289  for (size_t iw = 0; iw < nw; iw++)
290  WorkSpace_deviceptr_list[iw] = WorkSpace_list[iw].get().device_data();
291 
292  auto* WorkSpace_list_ptr = WorkSpace_deviceptr_list.data();
293  auto* mw_grads_ptr = mw_grads.data();
294  const size_t Grads_cols = mw_grads.cols();
295 
296  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) map(from:mw_grads_ptr[:mw_grads.size()]) \
297  map(always, to:WorkSpace_list_ptr[:nw])")
298  for (uint32_t iw = 0; iw < nw; iw++)
299  for (uint32_t count = 0; count < getNumDets; ++count)
300  mw_grads_ptr[(3 * iw + dx) * Grads_cols + count] = WorkSpace_list_ptr[iw][count];
301 }
302 
304  int ref,
305  const OffloadMatrix<ValueType>& psiinv,
306  const OffloadMatrix<ValueType>& psi,
307  const OffloadVector<int>& data,
308  const VectorSoaContainer<int, 2, OffloadPinnedAllocator<int>>& pairs,
310  OffloadMatrix<ValueType>& table_matrix,
311  int iat,
312  Matrix<ValueType>& ratios)
313 {
314  const ValueType det0 = ratios(ref, iat);
315  buildTableMatrix_calculateRatios_impl(ref, det0, WorkSpace.data(), psiinv, psi, table_matrix, data, pairs, sign);
316  //splatt
317  for (size_t count = 0; count < getNumDets(); ++count)
318  ratios(count, iat) = WorkSpace[count];
319 #if 0
320  ValueType det0 = ratios(ref,iat);
321  int num=psi.extent(1);
322  std::vector<std::pair<int,int> >::iterator it(pairs.begin()), last(pairs.end());
323  while(it != last)
324  {
325  table_matrix((*it).first,(*it).second) = simd::dot(psiinv[(*it).first],psi[(*it).second],num);
326  it++;
327  }
328  std::vector<int>::iterator it2 = data.begin();
329  int count= 0; // number of determinants processed
330  while(it2 != data.end())
331  {
332  int n = *it2; // number of excitations
333  if(count == ref)
334  {
335  it2+=3*n+1; // number of integers used to encode the current excitation
336  count++;
337  continue;
338  }
339  ratios(count,iat) = sign[count]*det0*CustomizedMatrixDet(n,table_matrix,it2+1);
340  count++;
341  it2+=3*n+1;
342  }
343 #endif
344 }
345 
347  const RefVectorWithLeader<ParticleSet>& P_list,
348  int iat)
349 {
350  const int nw = det_list.size();
351  MultiDiracDeterminant& det_leader = det_list.getLeader();
352  RefVectorWithLeader<SPOSet> phi_list(*det_leader.getPhi());
353 
354  ScopedTimer local_timer(det_leader.evaluateDetsForPtclMove_timer);
355 
356  RefVector<OffloadVector<ValueType>> psiV_list, new_ratios_to_ref_list;
357  RefVector<OffloadMatrix<ValueType>> TpsiM_list, psiM_list, table_matrix_list;
358  RefVector<OffloadMatrix<ValueType>> psiMinv_temp_list, psiMinv_list;
359 
360  phi_list.reserve(nw);
361  psiV_list.reserve(nw);
362  psiMinv_list.reserve(nw);
363  psiMinv_temp_list.reserve(nw);
364  table_matrix_list.reserve(nw);
365  TpsiM_list.reserve(nw);
366  psiM_list.reserve(nw);
367  new_ratios_to_ref_list.reserve(nw);
368 
369  for (size_t iw = 0; iw < nw; iw++)
370  {
371  MultiDiracDeterminant& det = (det_list[iw]);
372  det.UpdateMode = ORB_PBYP_PARTIAL;
373  phi_list.push_back(*det.Phi);
374  psiV_list.push_back(det.psiV);
375  psiMinv_list.push_back(det.psiMinv);
376  psiM_list.push_back(det.psiM);
377  psiMinv_temp_list.push_back(det.psiMinv_temp);
378  new_ratios_to_ref_list.push_back(det.new_ratios_to_ref_);
379  table_matrix_list.push_back(det.table_matrix);
380  TpsiM_list.push_back(det.TpsiM);
381  }
382 
383 
384  det_leader.UpdateMode = ORB_PBYP_RATIO;
385  const int WorkingIndex = iat - det_leader.FirstIndex;
386 
387  {
388  ScopedTimer local_timer(det_leader.evalOrbValue_timer);
389  for (size_t iw = 0; iw < nw; iw++)
390  {
391  MultiDiracDeterminant& det = (det_list[iw]);
392  Vector<ValueType> psiV_list_host_view(psiV_list[iw].get().data(), psiV_list[iw].get().size());
393  det.getPhi()->evaluateValue(P_list[iw], iat, psiV_list_host_view);
394  ///Transfer of data from host to Device
395  {
396  ScopedTimer local_timer(det.transferH2D_timer);
397  psiV_list[iw].get().updateTo();
398  }
399  }
400  }
401 
402  size_t success = 0;
403  int dummy_handle = 0;
404  const auto psiMinv_rows = psiMinv_list[0].get().rows();
405  const auto psiMinv_cols = psiMinv_list[0].get().cols();
406  const auto TpsiM_cols = TpsiM_list[0].get().cols();
407  const auto psiM_cols = psiM_list[0].get().cols();
408  const auto TpsiM_rows = TpsiM_list[0].get().rows();
409  const auto NumPtcls = det_leader.NumPtcls;
410  const auto NumOrbitals = det_leader.NumOrbitals;
411  const auto& confgList = *det_leader.ciConfigList;
412 
413  auto& mw_res = det_leader.mw_res_handle_.getResource();
414  auto* psiV_list_devptr = mw_res.psiV_deviceptr_list.device_data();
415  auto* psiV_temp_list_ptr = mw_res.psiV_temp_deviceptr_list.data();
416 
417  auto* psiMinv_list_devptr = mw_res.psiMinv_deviceptr_list.device_data();
418  auto* psiMinv_temp_list_devptr = mw_res.psiMinv_temp_deviceptr_list.device_data();
419 
420  auto* TpsiM_list_devptr = mw_res.TpsiM_deviceptr_list.device_data();
421  auto* psiM_list_ptr = mw_res.psiM_deviceptr_list.data();
422 
423  auto& curRatio_list = mw_res.curRatio_list;
424  curRatio_list.resize(nw);
425  auto* curRatio_list_ptr = curRatio_list.data();
426 
427  auto& inv_curRatio_list = mw_res.inv_curRatio_list;
428  inv_curRatio_list.resize(nw);
429  auto* inv_curRatio_list_ptr = inv_curRatio_list.data();
430 
431  auto* confgListOccup_ptr = det_leader.refdet_occup->data();
432 
433  {
434  success = ompBLAS::copy_batched(dummy_handle, psiMinv_rows * psiMinv_cols, psiMinv_list_devptr, 1,
435  psiMinv_temp_list_devptr, 1, nw);
436  if (success != 0)
437  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
438 
439  success = ompBLAS::copy_batched_offset(dummy_handle, det_leader.NumOrbitals, psiV_list_devptr, 0, 1,
440  TpsiM_list_devptr, WorkingIndex, TpsiM_cols, nw);
441  if (success != 0)
442  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
443 
444  PRAGMA_OFFLOAD("omp target teams distribute map(always, from:curRatio_list_ptr[:nw]) \
445  is_device_ptr(psiV_list_devptr, psiMinv_temp_list_devptr)")
446  for (uint32_t iw = 0; iw < nw; iw++)
447  {
448  ValueType c_ratio = 0.0;
449  PRAGMA_OFFLOAD("omp parallel for reduction(+ : c_ratio)")
450  for (uint32_t jc = 0; jc < NumPtcls; jc++)
451  {
452  const size_t J = confgListOccup_ptr[jc];
453  psiV_temp_list_ptr[iw][jc] = psiV_list_devptr[iw][J];
454  size_t ic = jc * psiMinv_cols;
455  c_ratio += (psiMinv_temp_list_devptr[iw] + WorkingIndex)[ic] * psiV_temp_list_ptr[iw][jc];
456  }
457  curRatio_list_ptr[iw] = c_ratio;
458  inv_curRatio_list_ptr[iw] = ValueType(1) / c_ratio;
459  }
460 
461  det_leader.mw_InverseUpdateByColumn(det_leader.mw_res_handle_, WorkingIndex, inv_curRatio_list,
462  mw_res.psiV_temp_deviceptr_list, mw_res.psiMinv_temp_deviceptr_list,
463  psiMinv_rows);
464  ///This is needed by acceptMove. Eventually acceptMove will need to become mw_acceptMove.
465  {
466  ScopedTimer local_timer(det_leader.transferD2H_timer);
467  for (size_t iw = 0; iw < nw; iw++)
468  psiMinv_temp_list[iw].get().updateFrom();
469  }
470 
471  auto& det0_list = mw_res.cone_vec;
473  det0_list, psiMinv_temp_list, TpsiM_list, *det_leader.detData,
474  *det_leader.uniquePairs, *det_leader.DetSigns, table_matrix_list,
475  new_ratios_to_ref_list);
476 
477  // restore the modified column of TpsiM.
478  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(TpsiM_list_devptr) \
479  map(always, to:psiM_list_ptr[:nw])")
480  for (uint32_t iw = 0; iw < nw; iw++)
481  for (uint32_t i = 0; i < NumOrbitals; i++)
482  TpsiM_list_devptr[iw][i * TpsiM_cols + WorkingIndex] = psiM_list_ptr[iw][i + psiM_cols * WorkingIndex];
483  }
484 
485  for (size_t iw = 0; iw < nw; iw++)
486  {
487  MultiDiracDeterminant& det = (det_list[iw]);
488  det.curRatio = curRatio_list_ptr[iw];
489  }
490 }
491 
493 {
495 
497  {
498  ScopedTimer orb_timer(evalOrbValue_timer);
499  Vector<ValueType> psiV_host_view(psiV.data(), psiV.size());
500  Phi->evaluateValue(P, iat, psiV_host_view);
501  }
502  const int WorkingIndex = (refPtcl < 0 ? iat : refPtcl) - FirstIndex;
503  assert(WorkingIndex >= 0 && WorkingIndex < LastIndex - FirstIndex);
504  const auto& confgList = *ciConfigList;
505  //std::vector<int>::iterator it(confgList[ReferenceDeterminant].occup.begin());
506  auto it(confgList[ReferenceDeterminant].occup.begin());
507  // mmorales: the only reason this is here is because
508  // NonlocalECP do not necessarily call rejectMove after
509  // calling ratio(), and even if the move is rejected
510  // this matrix needs to be restored
511  // If we always restore after ratio, then this is not needed
512  // For efficiency reasons, I don't do this for ratioGrad or ratio(P,dG,dL)
513  {
516  for (size_t i = 0; i < NumPtcls; i++)
517  psiV_temp[i] = psiV[*(it++)];
520  for (size_t i = 0; i < NumOrbitals; i++)
521  TpsiM(i, WorkingIndex) = psiV[i];
522  }
525  // check comment above
526  for (size_t i = 0; i < NumOrbitals; i++)
527  TpsiM(i, WorkingIndex) = psiM(WorkingIndex, i);
528 }
529 
531 {
534  {
535  ScopedTimer local_timer(evalOrbVGL_timer);
536  // Using Host Views for Phi-evaluateVGL since not ported to GPU
537  Vector<ValueType> psiV_host_view(psiV.data(), psiV.size());
538  Vector<GradType> dpsiV_host_view(dpsiV.data(), dpsiV.size());
539  Vector<ValueType> d2psiV_host_view(d2psiV.data(), d2psiV.size());
540  Phi->evaluateVGL(P, iat, psiV_host_view, dpsiV_host_view, d2psiV_host_view);
541  }
542  const int WorkingIndex = iat - FirstIndex;
543  const auto& confgList = *ciConfigList;
544  assert(WorkingIndex >= 0 && WorkingIndex < LastIndex - FirstIndex);
545 
546  GradType ratioGradRef;
547  {
549  //mmorales: check comment above
551  //std::vector<int>::iterator it(confgList[ReferenceDeterminant].occup.begin());
552  auto it(confgList[ReferenceDeterminant].occup.begin());
553  for (size_t i = 0; i < NumPtcls; i++)
554  {
555  psiV_temp[i] = psiV[*it];
556  ratioGradRef += psiMinv_temp(i, WorkingIndex) * dpsiV[*it];
557  it++;
558  }
561  for (size_t i = 0; i < NumOrbitals; i++)
562  TpsiM(i, WorkingIndex) = psiV[i];
563  }
566  for (size_t idim = 0; idim < OHMMS_DIM; idim++)
567  {
568  {
570  //dpsiMinv = psiMinv_temp;
571  dpsiMinv = psiMinv;
572  auto it(confgList[ReferenceDeterminant].occup.begin());
573  for (size_t i = 0; i < NumPtcls; i++)
574  psiV_temp[i] = dpsiV[*(it++)][idim];
575  InverseUpdateByColumn(dpsiMinv, psiV_temp, workV1, workV2, WorkingIndex, ratioGradRef[idim]);
576  for (size_t i = 0; i < NumOrbitals; i++)
577  TpsiM(i, WorkingIndex) = dpsiV[i][idim];
578  }
580  ratioGradRef[idim] / curRatio, table_matrix, idim, WorkingIndex, new_grads);
581  }
582  // check comment above
583  for (int i = 0; i < NumOrbitals; i++)
584  TpsiM(i, WorkingIndex) = psiM(WorkingIndex, i);
585 }
586 
588 {
589  assert(P.isSpinor() == is_spinor_);
592  {
593  ScopedTimer orb_timer(evalOrbVGL_timer);
594  // Creating Host view to call Phi->evaluateVGL
595  Vector<ValueType> psiV_host_view(psiV.data(), psiV.size());
596  Vector<GradType> dpsiV_host_view(dpsiV.data(), dpsiV.size());
597  Vector<ValueType> d2psiV_host_view(d2psiV.data(), d2psiV.size());
598  Phi->evaluateVGL_spin(P, iat, psiV_host_view, dpsiV_host_view, d2psiV_host_view, dspin_psiV);
599  }
600  const int WorkingIndex = iat - FirstIndex;
601  assert(WorkingIndex >= 0 && WorkingIndex < LastIndex - FirstIndex);
602  const auto& confgList = *ciConfigList;
603  GradType ratioGradRef;
604  ValueType ratioSpinGradRef = 0.0;
605  {
607  //mmorales: check comment above
609  //std::vector<int>::iterator it(confgList[ReferenceDeterminant].occup.begin());
610  auto it(confgList[ReferenceDeterminant].occup.begin());
611  for (size_t i = 0; i < NumPtcls; i++)
612  {
613  psiV_temp[i] = psiV[*it];
614  ratioGradRef += psiMinv_temp(i, WorkingIndex) * dpsiV[*it];
615  ratioSpinGradRef += psiMinv_temp(i, WorkingIndex) * dspin_psiV[*it];
616  it++;
617  }
619  new_spingrads(ReferenceDeterminant, WorkingIndex) = ratioSpinGradRef / curRatio;
621  for (size_t i = 0; i < NumOrbitals; i++)
622  TpsiM(i, WorkingIndex) = psiV[i];
623  }
626  for (size_t idim = 0; idim < OHMMS_DIM; idim++)
627  {
628  {
630  //dpsiMinv = psiMinv_temp;
631  dpsiMinv = psiMinv;
632  auto it(confgList[ReferenceDeterminant].occup.begin());
633  for (size_t i = 0; i < NumPtcls; i++)
634  psiV_temp[i] = dpsiV[*(it++)][idim];
635  InverseUpdateByColumn(dpsiMinv, psiV_temp, workV1, workV2, WorkingIndex, ratioGradRef[idim]);
636  for (size_t i = 0; i < NumOrbitals; i++)
637  TpsiM(i, WorkingIndex) = dpsiV[i][idim];
638  }
640  ratioGradRef[idim] / curRatio, table_matrix, idim, WorkingIndex, new_grads);
641  }
642 
643  //Now compute the spin gradient, same procedure as normal gradient components above
644  {
646  dpsiMinv = psiMinv;
647  auto it(confgList[ReferenceDeterminant].occup.begin());
648  for (size_t i = 0; i < NumPtcls; i++)
649  psiV_temp[i] = dspin_psiV[*(it++)];
650  InverseUpdateByColumn(dpsiMinv, psiV_temp, workV1, workV2, WorkingIndex, ratioSpinGradRef);
651  for (size_t i = 0; i < NumOrbitals; i++)
652  TpsiM(i, WorkingIndex) = dspin_psiV[i];
653  }
655  *DetSigns, table_matrix, WorkingIndex, new_spingrads);
656 
657  // check comment above
658  for (int i = 0; i < NumOrbitals; i++)
659  TpsiM(i, WorkingIndex) = psiM(WorkingIndex, i);
660 }
661 
664  const RefVectorWithLeader<ParticleSet>& P_list,
665  int iat,
667 {
668  const int nw = det_list.size();
669  MultiDiracDeterminant& det_leader = det_list.getLeader();
670  RefVectorWithLeader<SPOSet> phi_list(*det_leader.getPhi());
671 
672  ScopedTimer local_timer(det_leader.evaluateDetsAndGradsForPtclMove_timer);
673  int success = 0;
674  int dummy_handle = 0;
675  const size_t NumOrbitals(det_leader.NumOrbitals);
676  const size_t NumPtcls(det_leader.NumPtcls);
677 
678  RefVector<OffloadVector<ValueType>> psiV_list, psiV_temp_list, new_ratios_to_ref_list, WorkSpace_list;
680 
681  RefVector<OffloadMatrix<ValueType>> psiMinv_temp_list, psiMinv_list, dpsiMinv_list;
682  RefVector<OffloadMatrix<ValueType>> table_matrix_list, psiM_list, TpsiM_list;
683 
685 
686  phi_list.reserve(nw);
687  psiV_list.reserve(nw);
688  dpsiV_list.reserve(nw);
689  d2psiV_list.reserve(nw);
690  psiV_temp_list.reserve(nw);
691  psiMinv_temp_list.reserve(nw);
692  psiMinv_list.reserve(nw);
693  psiM_list.reserve(nw);
694 
695  TpsiM_list.reserve(nw);
696  new_ratios_to_ref_list.reserve(nw);
697  table_matrix_list.reserve(nw);
698  dpsiMinv_list.reserve(nw);
699  WorkSpace_list.reserve(nw);
700 
701  auto& mw_res = det_leader.mw_res_handle_.getResource();
702  auto& ratioGradRef_list = mw_res.ratioGradRef_list;
703  auto& det0_grad_list = mw_res.det0_grad_list;
704  ratioGradRef_list.resize(nw);
705  det0_grad_list.resize(nw);
706 
707  det_leader.UpdateMode = ORB_PBYP_PARTIAL;
708  const int WorkingIndex = iat - det_leader.FirstIndex;
709 
710  for (size_t iw = 0; iw < nw; iw++)
711  {
712  MultiDiracDeterminant& det = (det_list[iw]);
713  det.UpdateMode = ORB_PBYP_PARTIAL;
714  phi_list.push_back(*det.Phi);
715  psiV_list.push_back(det.psiV);
716  dpsiV_list.push_back(det.dpsiV);
717  d2psiV_list.push_back(det.d2psiV);
718  psiV_temp_list.push_back(det.psiV_temp);
719  psiMinv_list.push_back(det.psiMinv);
720  psiM_list.push_back(det.psiM);
721  psiMinv_temp_list.push_back(det.psiMinv_temp);
722  new_ratios_to_ref_list.push_back(det.new_ratios_to_ref_);
723  TpsiM_list.push_back(det.TpsiM);
724  table_matrix_list.push_back(det.table_matrix);
725  dpsiMinv_list.push_back(det.dpsiMinv);
726  WorkSpace_list.push_back(det.WorkSpace);
727  }
728 
729  {
730  ScopedTimer local_timer(det_leader.evalOrbVGL_timer);
731  for (size_t iw = 0; iw < nw; iw++)
732  {
733  MultiDiracDeterminant& det = (det_list[iw]);
734  Vector<ValueType> psiV_list_host_view(psiV_list[iw].get().data(), psiV_list[iw].get().size());
735  Vector<GradType> dpsiV_list_host_view(dpsiV_list[iw].get().data(), dpsiV_list[iw].get().size());
736  Vector<ValueType> d2psiV_list_host_view(d2psiV_list[iw].get().data(), d2psiV_list[iw].get().size());
737  det.Phi->evaluateVGL(P_list[iw], iat, psiV_list_host_view, dpsiV_list_host_view, d2psiV_list_host_view);
738 
739  {
740  ScopedTimer local_timer(det_leader.transferH2D_timer);
741  psiV_list[iw].get().updateTo();
742  dpsiV_list[iw].get().updateTo();
743  }
744  }
745  }
746 
747  const auto psiMinv_rows = psiMinv_list[0].get().rows();
748  const auto psiMinv_cols = psiMinv_list[0].get().cols();
749  const auto TpsiM_num_cols = TpsiM_list[0].get().cols();
750  const auto psiM_num_cols = psiM_list[0].get().cols();
751  const auto& confgList = *det_leader.ciConfigList;
752 
753  auto* psiV_list_devptr = mw_res.psiV_deviceptr_list.device_data();
754  auto* psiV_temp_list_ptr = mw_res.psiV_temp_deviceptr_list.data();
755  auto* TpsiM_list_devptr = mw_res.TpsiM_deviceptr_list.device_data();
756  auto* psiM_list_ptr = mw_res.psiM_deviceptr_list.data();
757  auto* psiMinv_list_devptr = mw_res.psiMinv_deviceptr_list.device_data();
758  auto* dpsiMinv_list_devptr = mw_res.dpsiMinv_deviceptr_list.device_data();
759  auto* psiMinv_temp_list_devptr = mw_res.psiMinv_temp_deviceptr_list.device_data();
760 
761  auto& curRatio_list = mw_res.curRatio_list;
762  curRatio_list.resize(nw);
763  auto* curRatio_list_ptr = curRatio_list.data();
764 
765  auto& inv_curRatio_list = mw_res.inv_curRatio_list;
766  inv_curRatio_list.resize(nw);
767  auto* inv_curRatio_list_ptr = inv_curRatio_list.data();
768 
769  auto* det0_grad_list_ptr = det0_grad_list.data();
770  auto* confgListOccup_ptr = det_leader.refdet_occup->data();
771 
772  auto* dpsiV_list_ptr = mw_res.dpsiV_deviceptr_list.data();
773  auto* ratioGradRef_list_ptr = ratioGradRef_list.data();
774 
775  {
776  success = ompBLAS::copy_batched(dummy_handle, psiMinv_rows * psiMinv_cols, psiMinv_list_devptr, 1,
777  psiMinv_temp_list_devptr, 1, nw);
778  if (success != 0)
779  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
780 
781 
782  // Index of loop over nw must be 32 bit sized to avoid assignment-after-reduction offload bug
783  // See https://github.com/QMCPACK/qmcpack/issues/4767
784  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(psiV_list_devptr, psiMinv_temp_list_devptr) \
785  map(always, from:curRatio_list_ptr[:nw])")
786  for (uint32_t iw = 0; iw < nw; iw++)
787  {
788  GradType ratioGradRef_local(0);
789  PRAGMA_OFFLOAD("omp parallel for reduction(+ : ratioGradRef_local)")
790  for (uint32_t i = 0; i < NumPtcls; i++)
791  {
792  const size_t J = confgListOccup_ptr[i];
793  psiV_temp_list_ptr[iw][i] = psiV_list_devptr[iw][J];
794  ratioGradRef_local += psiMinv_temp_list_devptr[iw][i * psiMinv_cols + WorkingIndex] * dpsiV_list_ptr[iw][J];
795  }
796  // Workaround for assignment-after-reduction issue.
797  // In full precision, changing the loop variable iw to 32 bits in size works.
798  // In mixed precision, changing the loop variable iw to 16 bits would work, but is not acceptable.
799  // For that case we use the other workaround of assigning each value in the array separately.
800  // See https://github.com/QMCPACK/qmcpack/issues/4767
801  ratioGradRef_list_ptr[iw][0] = ratioGradRef_local[0];
802  ratioGradRef_list_ptr[iw][1] = ratioGradRef_local[1];
803  ratioGradRef_list_ptr[iw][2] = ratioGradRef_local[2];
804 
805  ValueType c_ratio = 0.0;
806  PRAGMA_OFFLOAD("omp parallel for reduction(+ : c_ratio)")
807  for (uint32_t jc = 0; jc < psiMinv_cols; jc += 1)
808  {
809  const size_t ic = jc * psiMinv_cols;
810  c_ratio += (psiMinv_temp_list_devptr[iw] + WorkingIndex)[ic] * psiV_temp_list_ptr[iw][jc];
811  }
812  curRatio_list_ptr[iw] = c_ratio;
813  inv_curRatio_list_ptr[iw] = ValueType(1) / c_ratio;
814  }
815 
816  success = ompBLAS::copy_batched_offset(dummy_handle, det_leader.NumOrbitals, psiV_list_devptr, 0, 1,
817  TpsiM_list_devptr, WorkingIndex, TpsiM_num_cols, nw);
818  if (success != 0)
819  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
820 
821 
822  det_leader.mw_InverseUpdateByColumn(det_leader.mw_res_handle_, WorkingIndex, inv_curRatio_list,
823  mw_res.psiV_temp_deviceptr_list, mw_res.psiMinv_temp_deviceptr_list,
824  psiMinv_rows);
825 
826  ///This is needed by Host in acceptMove. Eventually acceptMove will need to become mw_acceptMove.
827  {
828  ScopedTimer local_timer(det_leader.transferD2H_timer);
829  for (size_t iw = 0; iw < nw; iw++)
830  psiMinv_temp_list[iw].get().updateFrom();
831  }
832 
833  auto& det0_list = mw_res.cone_vec;
835  det0_list, psiMinv_temp_list, TpsiM_list, *det_leader.detData,
836  *det_leader.uniquePairs, *det_leader.DetSigns, table_matrix_list,
837  new_ratios_to_ref_list);
838 
839  for (size_t idim = 0; idim < OHMMS_DIM; idim++)
840  {
841  success = ompBLAS::copy_batched(dummy_handle, psiMinv_rows * psiMinv_cols, psiMinv_list_devptr, 1,
842  dpsiMinv_list_devptr, 1, nw);
843  if (success != 0)
844  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
845 
846  PRAGMA_OFFLOAD("omp target teams distribute map(to: ratioGradRef_list_ptr[:nw])")
847  for (uint32_t iw = 0; iw < nw; iw++)
848  {
849  inv_curRatio_list_ptr[iw] = ValueType(1) / ratioGradRef_list_ptr[iw][idim];
850 
851  for (uint32_t i = 0; i < NumPtcls; i++)
852  {
853  const size_t J = confgListOccup_ptr[i];
854  psiV_temp_list_ptr[iw][i] = dpsiV_list_ptr[iw][J][idim];
855  }
856  }
857 
858  det_leader.mw_InverseUpdateByColumn(det_leader.mw_res_handle_, WorkingIndex, inv_curRatio_list,
859  mw_res.psiV_temp_deviceptr_list, mw_res.dpsiMinv_deviceptr_list,
860  psiMinv_rows);
861 
862  PRAGMA_OFFLOAD("omp target teams distribute map(to:dpsiV_list_ptr[:nw], curRatio_list_ptr[:nw]) \
863  map(always,from:det0_grad_list_ptr[:nw]) \
864  is_device_ptr(TpsiM_list_devptr)")
865  for (uint32_t iw = 0; iw < nw; iw++)
866  {
867  det0_grad_list_ptr[iw] = ratioGradRef_list_ptr[iw][idim] / curRatio_list_ptr[iw];
868  for (uint32_t i = 0; i < NumOrbitals; i++)
869  TpsiM_list_devptr[iw][i * TpsiM_num_cols + WorkingIndex] = dpsiV_list_ptr[iw][i][idim];
870  }
871 
873  WorkingIndex, idim, det_leader.getNumDets(), det0_grad_list,
874  dpsiMinv_list, TpsiM_list, *det_leader.detData,
875  *det_leader.uniquePairs, *det_leader.DetSigns, WorkSpace_list,
876  table_matrix_list, mw_grads);
877  }
878 
879  // restore the modified column of TpsiM.
880  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(TpsiM_list_devptr) \
881  map(always, to:psiM_list_ptr[:nw])")
882  for (uint32_t iw = 0; iw < nw; iw++)
883  for (uint32_t i = 0; i < NumOrbitals; i++)
884  TpsiM_list_devptr[iw][i * TpsiM_num_cols + WorkingIndex] = psiM_list_ptr[iw][i + psiM_num_cols * WorkingIndex];
885  }
886 
887  for (size_t iw = 0; iw < nw; iw++)
888  {
889  MultiDiracDeterminant& det = (det_list[iw]);
890  det.curRatio = curRatio_list[iw];
891  }
892 }
893 
895 {
896  const int WorkingIndex = iat - FirstIndex;
897  assert(WorkingIndex >= 0 && WorkingIndex < LastIndex - FirstIndex);
898 
899  const auto& confgList = *ciConfigList;
900  for (size_t idim = 0; idim < OHMMS_DIM; idim++)
901  {
902  //dpsiMinv = psiMinv_temp;
903  dpsiMinv = psiMinv;
904  auto it = confgList[ReferenceDeterminant].occup.begin();
905  ValueType ratioG = 0.0;
906  for (size_t i = 0; i < NumPtcls; i++)
907  {
908  psiV_temp[i] = dpsiM(WorkingIndex, *it)[idim];
909  ratioG += psiMinv(i, WorkingIndex) * dpsiM(WorkingIndex, *it)[idim];
910  it++;
911  }
912  InverseUpdateByColumn(dpsiMinv, psiV_temp, workV1, workV2, WorkingIndex, ratioG);
913  for (size_t i = 0; i < NumOrbitals; i++)
914  TpsiM(i, WorkingIndex) = dpsiM(WorkingIndex, i)[idim];
916  ratioG, table_matrix, idim, WorkingIndex, grads);
917  }
918  // check comment above
919  for (size_t i = 0; i < NumOrbitals; i++)
920  TpsiM(i, WorkingIndex) = psiM(WorkingIndex, i);
921 }
922 
924 {
925  assert(P.isSpinor() == is_spinor_);
926  const int WorkingIndex = iat - FirstIndex;
927  assert(WorkingIndex >= 0 && WorkingIndex < LastIndex - FirstIndex);
928 
929  const auto& confgList = *ciConfigList;
930  for (size_t idim = 0; idim < OHMMS_DIM; idim++)
931  {
932  //dpsiMinv = psiMinv_temp;
933  dpsiMinv = psiMinv;
934  auto it = confgList[ReferenceDeterminant].occup.begin();
935  ValueType ratioG = 0.0;
936  for (size_t i = 0; i < NumPtcls; i++)
937  {
938  psiV_temp[i] = dpsiM(WorkingIndex, *it)[idim];
939  ratioG += psiMinv(i, WorkingIndex) * dpsiM(WorkingIndex, *it)[idim];
940  it++;
941  }
942  InverseUpdateByColumn(dpsiMinv, psiV_temp, workV1, workV2, WorkingIndex, ratioG);
943  for (size_t i = 0; i < NumOrbitals; i++)
944  TpsiM(i, WorkingIndex) = dpsiM(WorkingIndex, i)[idim];
946  ratioG, table_matrix, idim, WorkingIndex, grads);
947  }
948 
949  //Now compute the spin gradient, same procedure as normal gradient components above
950  dpsiMinv = psiMinv;
951  auto it = confgList[ReferenceDeterminant].occup.begin();
952  ValueType ratioSG = 0.0;
953  for (size_t i = 0; i < NumPtcls; i++)
954  {
955  psiV_temp[i] = dspin_psiM(WorkingIndex, *it);
956  ratioSG += psiMinv(i, WorkingIndex) * dspin_psiM(WorkingIndex, *it);
957  it++;
958  }
959  spingrads(ReferenceDeterminant, WorkingIndex) = ratioSG;
960  InverseUpdateByColumn(dpsiMinv, psiV_temp, workV1, workV2, WorkingIndex, ratioSG);
961  for (size_t i = 0; i < NumOrbitals; i++)
962  TpsiM(i, WorkingIndex) = dspin_psiM(WorkingIndex, i);
964  *DetSigns, table_matrix, WorkingIndex, spingrads);
965 
966  // check comment above
967  for (size_t i = 0; i < NumOrbitals; i++)
968  TpsiM(i, WorkingIndex) = psiM(WorkingIndex, i);
969 }
970 
972  const RefVectorWithLeader<ParticleSet>& P_list,
973  int iat,
975 {
976  const int nw = det_list.size();
977  MultiDiracDeterminant& det_leader = det_list.getLeader();
978  const int WorkingIndex = iat - det_leader.FirstIndex;
979  int success = 0;
980  int dummy_handle = 0;
981  const size_t NumOrbitals(det_leader.NumOrbitals);
982  const size_t NumPtcls(det_leader.NumPtcls);
983 
984  ScopedTimer local_timer(det_leader.evaluateGrads_timer);
985 
986  RefVector<OffloadMatrix<ValueType>> dpsiMinv_list, psiMinv_list;
988  RefVector<OffloadVector<ValueType>> psiV_temp_list, WorkSpace_list;
989  RefVector<OffloadMatrix<ValueType>> table_matrix_list, TpsiM_list, psiM_list;
990  //RefVector<OffloadMatrix<GradType>> grads_list;
991 
992  psiMinv_list.reserve(nw);
993  dpsiMinv_list.reserve(nw);
994  dpsiM_list.reserve(nw);
995  psiV_temp_list.reserve(nw);
996  //grads_list.reserve(nw);
997  table_matrix_list.reserve(nw);
998  TpsiM_list.reserve(nw);
999  psiM_list.reserve(nw);
1000  WorkSpace_list.reserve(nw);
1001 
1002  for (size_t iw = 0; iw < nw; iw++)
1003  {
1004  MultiDiracDeterminant& det = (det_list[iw]);
1005  psiMinv_list.push_back(det.psiMinv);
1006  dpsiMinv_list.push_back(det.dpsiMinv);
1007  psiV_temp_list.push_back(det.psiV_temp);
1008  dpsiM_list.push_back(det.dpsiM);
1009  //grads_list.push_back(det.grads);
1010  TpsiM_list.push_back(det.TpsiM);
1011  psiM_list.push_back(det.psiM);
1012  table_matrix_list.push_back(det.table_matrix);
1013  WorkSpace_list.push_back(det.WorkSpace);
1014  }
1015 
1016  const auto psiMinv_rows = psiMinv_list[0].get().rows();
1017  const auto psiMinv_cols = psiMinv_list[0].get().cols();
1018  const auto TpsiM_cols = TpsiM_list[0].get().cols();
1019  const auto psiM_cols = psiM_list[0].get().cols();
1020  const auto dpsiM_cols = dpsiM_list[0].get().cols();
1021  const auto dpsiM_rows = dpsiM_list[0].get().rows();
1022  const auto& confgList = *det_leader.ciConfigList;
1023 
1024  auto& mw_res = det_leader.mw_res_handle_.getResource();
1025  auto& ratioG_list = mw_res.curRatio_list;
1026  ratioG_list.resize(nw);
1027  auto* ratioG_list_ptr = ratioG_list.data();
1028 
1029  auto& inv_ratioG_list = mw_res.inv_curRatio_list;
1030  inv_ratioG_list.resize(nw);
1031  auto* inv_ratioG_list_ptr = inv_ratioG_list.data();
1032 
1033  auto* psiMinv_list_devptr = mw_res.psiMinv_deviceptr_list.device_data();
1034  auto* dpsiMinv_list_devptr = mw_res.dpsiMinv_deviceptr_list.device_data();
1035  auto* TpsiM_list_devptr = mw_res.TpsiM_deviceptr_list.data();
1036  auto* psiM_list_ptr = mw_res.psiM_deviceptr_list.data();
1037  auto* psiV_temp_list_ptr = mw_res.psiV_temp_deviceptr_list.data();
1038  auto* dpsiM_list_ptr = mw_res.dpsiM_deviceptr_list.data();
1039  auto* confgListOccup_ptr = det_leader.refdet_occup->data();
1040 
1041  {
1042  for (size_t idim = 0; idim < OHMMS_DIM; idim++)
1043  {
1044  success = ompBLAS::copy_batched(dummy_handle, psiMinv_rows * psiMinv_cols, psiMinv_list_devptr, 1,
1045  dpsiMinv_list_devptr, 1, nw);
1046  if (success != 0)
1047  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
1048 
1049  PRAGMA_OFFLOAD("omp target teams distribute map(always, to: psiV_temp_list_ptr[:nw]) \
1050  map(always, to: dpsiM_list_ptr[:nw])")
1051  for (uint32_t iw = 0; iw < nw; iw++)
1052  for (uint32_t i = 0; i < NumPtcls; i++)
1053  {
1054  size_t J = confgListOccup_ptr[i];
1055  psiV_temp_list_ptr[iw][i] = dpsiM_list_ptr[iw][WorkingIndex * dpsiM_cols + J][idim];
1056  }
1057 
1058  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(psiMinv_list_devptr) \
1059  map(always, from: ratioG_list_ptr[:nw])")
1060  for (uint32_t iw = 0; iw < nw; iw++)
1061  {
1062  ValueType ratioG_local(0);
1063  PRAGMA_OFFLOAD("omp parallel for reduction(+ : ratioG_local)")
1064  for (uint32_t i = 0; i < NumPtcls; i++)
1065  {
1066  size_t J = confgListOccup_ptr[i];
1067  ratioG_local += psiMinv_list_devptr[iw][i * psiMinv_cols + WorkingIndex] *
1068  dpsiM_list_ptr[iw][WorkingIndex * dpsiM_cols + J][idim];
1069  }
1070  ratioG_list_ptr[iw] = ratioG_local;
1071  inv_ratioG_list_ptr[iw] = ValueType(1) / ratioG_local;
1072  }
1073 
1074  det_leader.mw_InverseUpdateByColumn(det_leader.mw_res_handle_, WorkingIndex, inv_ratioG_list,
1075  mw_res.psiV_temp_deviceptr_list, mw_res.dpsiMinv_deviceptr_list,
1076  psiMinv_rows);
1077 
1078  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) map(to:dpsiM_list_ptr[:nw]) \
1079  map(always, to: TpsiM_list_devptr[:nw])")
1080  for (uint32_t iw = 0; iw < nw; iw++)
1081  for (uint32_t i = 0; i < NumOrbitals; i++)
1082  TpsiM_list_devptr[iw][i * TpsiM_cols + WorkingIndex] =
1083  dpsiM_list_ptr[iw][dpsiM_cols * WorkingIndex + i][idim];
1084 
1085 
1087  WorkingIndex, idim, det_leader.getNumDets(), ratioG_list,
1088  dpsiMinv_list, TpsiM_list, *det_leader.detData,
1089  *det_leader.uniquePairs, *det_leader.DetSigns, WorkSpace_list,
1090  table_matrix_list, mw_grads);
1091  }
1092 
1093  // restore the modified column of TpsiM.
1094  PRAGMA_OFFLOAD("omp target teams distribute parallel for map(from:TpsiM_list_devptr[:nw]) \
1095  map(always,to:psiM_list_ptr[:nw])")
1096  for (uint32_t iw = 0; iw < nw; iw++)
1097  for (uint32_t i = 0; i < NumOrbitals; i++)
1098  TpsiM_list_devptr[iw][i * TpsiM_cols + WorkingIndex] = psiM_list_ptr[iw][i + psiM_cols * WorkingIndex];
1099  }
1100 }
1101 
1103  const size_t det_offset,
1104  const size_t data_offset,
1105  SmallMatrixDetCalculator<ValueType>& det_calculator,
1106  const OffloadVector<int>& data,
1108  const RefVector<OffloadMatrix<ValueType>>& table_matrix_list,
1109  const RefVector<OffloadVector<ValueType>>& ratios_list) const
1110 {
1111  const size_t nw = ratios_list.size();
1112  const int* it2 = data.data() + data_offset;
1113  for (size_t iw = 0; iw < nw; iw++)
1114  for (size_t count = 0; count < (*ndets_per_excitation_level_)[ext_level]; ++count)
1115  {
1116  size_t det_id = det_offset + count;
1117  ratios_list[iw].get()[det_id] = sign[det_id] * ratios_list[iw].get()[0] *
1118  det_calculator.evaluate(table_matrix_list[iw].get(), it2 + 1 + count * (3 * ext_level + 1), ext_level);
1119  }
1120 }
1121 
1123  const OffloadVector<ValueType*>& ratios_deviceptr_list) const
1124 {
1125  ScopedTimer local_timer(updateRatios_timer);
1126 
1127  const size_t nw = ratios_deviceptr_list.size();
1128  auto* ratios_list_ptr = ratios_deviceptr_list.data();
1129  const auto* det0_list_ptr = det0_list.data();
1130 
1131  PRAGMA_OFFLOAD("omp target teams distribute parallel for map(always, to: det0_list_ptr[:nw])")
1132  for (uint32_t iw = 0; iw < nw; iw++)
1133  ratios_list_ptr[iw][0] = det0_list_ptr[iw];
1134 }
1135 
1136 template<unsigned EXT_LEVEL>
1137 void MultiDiracDeterminant::mw_updateRatios(const size_t det_offset,
1138  const size_t data_offset,
1139  const OffloadVector<int>& data,
1141  const OffloadVector<ValueType*>& table_matrix_deviceptr_list,
1142  const size_t num_table_matrix_cols,
1143  const OffloadVector<ValueType*>& ratios_deviceptr_list) const
1144 {
1145  const size_t nw = ratios_deviceptr_list.size();
1146  const size_t size_sign = sign.size();
1147  const size_t ndet_ext = (*ndets_per_excitation_level_)[EXT_LEVEL];
1148 
1149  ScopedTimer local_timer(updateRatios_timer);
1150 
1151  auto* ratios_list_ptr = ratios_deviceptr_list.data();
1152  const auto* sign_ptr = sign.data();
1153  const int* data_ptr = data.data();
1154  const auto* table_matrix_list_ptr = table_matrix_deviceptr_list.data();
1155 
1156  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2)")
1157  for (uint32_t iw = 0; iw < nw; iw++)
1158  for (uint32_t count = 0; count < ndet_ext; ++count)
1159  {
1160  size_t det_id = det_offset + count;
1161  ratios_list_ptr[iw][det_id] = sign_ptr[det_id] * ratios_list_ptr[iw][0] *
1162  CustomizedMatrixDet<EXT_LEVEL>::evaluate(table_matrix_list_ptr[iw],
1163  (data_ptr + data_offset) + 1 + count * (3 * EXT_LEVEL + 1),
1164  num_table_matrix_cols);
1165  }
1166 }
1167 
1169  const int working_index,
1170  const OffloadVector<ValueType>& inv_curRatio_list,
1171  const OffloadVector<ValueType*>& psiV_deviceptr_list,
1172  const OffloadVector<ValueType*>& psiMinv_deviceptr_list,
1173  const size_t psiMinv_rows) const
1174 {
1175  ScopedTimer local_timer(updateInverse_timer);
1176 
1177  constexpr ValueType cone(1);
1178 
1179  const size_t nw = inv_curRatio_list.size();
1180 
1181  ValueType* czero_ptr = mw_res.czero_vec.device_data();
1182  ValueType* cminus_one_ptr = mw_res.cminus_one_vec.device_data();
1183 
1184  int success = 0;
1185  int dummy_handle = 0;
1186 
1187  auto* psiV_list_devptr = psiV_deviceptr_list.device_data();
1188  auto* psiMinv_list_devptr = psiMinv_deviceptr_list.device_data();
1189  auto* workV1_list_ptr = mw_res.workV1_deviceptr_list.device_data();
1190  auto* workV2_list_ptr = mw_res.workV2_deviceptr_list.device_data();
1191  auto* inv_curRatio_list_devptr = inv_curRatio_list.device_data();
1192 
1193  success =
1194  ompBLAS::gemv_batched(dummy_handle, 'N', psiMinv_rows, psiMinv_rows, inv_curRatio_list_devptr,
1195  psiMinv_list_devptr, psiMinv_rows, psiV_list_devptr, 1, czero_ptr, workV1_list_ptr, 1, nw);
1196  if (success != 0)
1197  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::gemv_batched failed.");
1198 
1199  PRAGMA_OFFLOAD("omp target teams distribute parallel for is_device_ptr(workV1_list_ptr, inv_curRatio_list_devptr)")
1200  for (uint32_t iw = 0; iw < nw; iw++)
1201  workV1_list_ptr[iw][working_index] = cone - inv_curRatio_list_devptr[iw];
1202  if (success != 0)
1203  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
1204 
1205  success = ompBLAS::copy_batched_offset(dummy_handle, psiMinv_rows, psiMinv_list_devptr, working_index, psiMinv_rows,
1206  workV2_list_ptr, 0, 1, nw);
1207  if (success != 0)
1208  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed.");
1209 
1210  success = ompBLAS::ger_batched(dummy_handle, psiMinv_rows, psiMinv_rows, cminus_one_ptr, workV1_list_ptr, 1,
1211  workV2_list_ptr, 1, psiMinv_list_devptr, psiMinv_rows, nw);
1212  if (success != 0)
1213  throw std::runtime_error("In MultiDiracDeterminant ompBLAS::ger_batched failed.");
1214 }
1215 
1216 } // namespace qmcplusplus
std::shared_ptr< OffloadVector< size_t > > refdet_occup
reference determinant occupation
std::shared_ptr< std::vector< ci_configuration2 > > ciConfigList
use shared_ptr
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
std::shared_ptr< OffloadVector< int > > detData
Tensor< T, D >::Type_t det(const Tensor< T, D > &a)
Definition: TensorOps.h:838
T dot(const T *restrict a, const T *restrict b, int n, TRES res=TRES())
dot product
const int NumPtcls
number of particles which belong to this Dirac determinant
const std::unique_ptr< SPOSet > Phi
a set of single-particle orbitals used to fill in the values of the matrix
OffloadMatrix< GradType > dpsiM
dpsiM(i,j)
constexpr std::complex< float > cone
Definition: BLAS.hpp:50
pointer device_data()
Return the device_ptr matching X if this is a vector attached or owning dual space memory...
Definition: OhmmsVector.h:245
OffloadMatrix< ValueType > psiMinv_temp
Matrix< GradType > grads
store determinant grads (old and new)
LatticeGaussianProduct::GradType GradType
std::shared_ptr< std::vector< int > > ndets_per_excitation_level_
number of unique determinants at each excitation level (relative to reference) {1, n_singles, n_doubles, n_triples, ...}
SoA adaptor class for Vector<TinyVector<T,D> >
void mw_buildTableMatrix_calculateGradRatios(MultiDiracDetMultiWalkerResource &mw_res, int ref, int iat, int dx, int getNumDets, const OffloadVector< ValueType > &det0_grad_list, const RefVector< OffloadMatrix< ValueType >> &psiinv_list, const RefVector< OffloadMatrix< ValueType >> &psi_list, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign, const RefVector< OffloadVector< ValueType >> &WorkSpace_list, const RefVector< OffloadMatrix< ValueType >> &table_matrix_list, UnpinnedOffloadMatrix< ValueType > &mw_grads)
Function to calculate the ratio of the gradients of the excited determinant to the reference determin...
void evaluateGradsWithSpin(ParticleSet &P, int iat)
evaluate the gradients of all the unique determinants with one electron moved. Used by the table meth...
VALUE calcSmallDeterminant(size_t n, const VALUE *table_matrix, const int *it, const size_t nb_cols)
static constexpr size_t MaxSmallDet
for matrices with leading dimensions <= MaxSmallDet, compute determinant with direct expansion...
#define OHMMS_DIM
Definition: config.h:64
static constexpr int ReferenceDeterminant
all the unique determinants are sorted, the id of the reference det id is always 0 ...
size_type cols() const
Definition: OhmmsMatrix.h:78
size_type extent(int i) const
Definition: OhmmsMatrix.h:82
void mw_updateRatios_generic(const int ext_level, const size_t det_offset, const size_t data_offset, SmallMatrixDetCalculator< ValueType > &det_calculator, const OffloadVector< int > &data, const OffloadVector< RealType > &sign, const RefVector< OffloadMatrix< ValueType >> &table_matrix_list, const RefVector< OffloadVector< ValueType >> &ratios_list) const
update ratios with respect to the reference deteriminant for a given excitation level ...
void evaluateDetsAndGradsForPtclMoveWithSpin(const ParticleSet &P, int iat)
evaluate the value and gradients of all the unique determinants with one electron moved...
Specialized paritlce class for atomistic simulations.
Definition: ParticleSet.h:55
static void mw_evaluateGrads(const RefVectorWithLeader< MultiDiracDeterminant > &det_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, UnpinnedOffloadMatrix< ValueType > &mw_grads)
multi walker version of mw_evaluateGrads
void buildTableMatrix_calculateRatiosValueMatrixOneParticle(int ref, const OffloadMatrix< ValueType > &psiinv, const OffloadMatrix< ValueType > &psi, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign, OffloadMatrix< ValueType > &table_matrix, int iat, Matrix< ValueType > &ratios)
size_type size() const
return the current size
Definition: OhmmsVector.h:162
OffloadVector< ValueType > psiV
value of single-particle orbital for particle-by-particle update
ompBLAS_status gemv_batched(ompBLAS_handle &handle, const char trans, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const int batch_count)
void buildTableMatrix_calculateRatios_impl(int ref, ValueType det0, ValueType *restrict ratios, const OffloadMatrix< ValueType > &psiinv, const OffloadMatrix< ValueType > &psi, OffloadMatrix< ValueType > &table_matrix, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign)
Function to calculate the ratio of the excited determinant to the reference determinant in Customized...
QTBase::ValueType ValueType
Definition: Configuration.h:60
void mw_updateRatios_det0(const OffloadVector< ValueType > &det0_list, const OffloadVector< ValueType *> &ratios_deviceptr_list) const
update ratios of the reference deteriminant
std::shared_ptr< OffloadVector< RealType > > DetSigns
MatA::value_type DetRatioByColumn(const MatA &Minv, const VecB &newv, int colchanged)
determinant ratio with a column substitution
OffloadMatrix< ValueType > psiMinv
inverse Dirac determinant matrix of the reference det
static void mw_evaluateDetsForPtclMove(const RefVectorWithLeader< MultiDiracDeterminant > &det_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat)
multi walker version of evaluateDetsForPtclMove
ompBLAS_status copy_batched_offset(ompBLAS_handle &handle, const int n, const T *const x[], const int x_offset, const int incx, T *const y[], const int y_offset, const int incy, const int batch_count)
copy device data from x to y with additional offset applied to array of device pointers ...
std::shared_ptr< VectorSoaContainer< int, 2, OffloadPinnedAllocator< int > > > uniquePairs
ompBLAS_status ger_batched(ompBLAS_handle &handle, const int m, const int n, const T *alpha, const T *const x[], const int incx, const T *const y[], const int incy, T *const A[], const int lda, const int batch_count)
ValueType curRatio
new value of the reference determinant over the old value upon a proposed move
OffloadVector< ValueType > WorkSpace
static T evaluate(T a11, T a12, T a21, T a22)
void mw_buildTableMatrix_calculateRatios_impl(MultiDiracDetMultiWalkerResource &mw_res, int ref, const OffloadVector< ValueType > &det0_list, const RefVector< OffloadMatrix< ValueType >> &psiinv_list, const RefVector< OffloadMatrix< ValueType >> &psi_list, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign, const RefVector< OffloadMatrix< ValueType >> &table_matrix_list, const RefVector< OffloadVector< ValueType >> &ratios_list)
Function to calculate the ratio of the excited determinant to the reference determinant in Customized...
OMPallocator is an allocator with fused device and dualspace allocator functionality.
double sign(double x)
Definition: Standard.h:73
OffloadMatrix< ValueType > table_matrix
std::complex< double > I
SmallMatrixDetCalculator< ValueType > det_calculator_
OffloadVector< ValueType > psiV_temp
std::vector< std::reference_wrapper< T > > RefVector
const int NumOrbitals
number of single-particle orbitals which belong to this Dirac determinant
void mw_buildTableMatrix_calculateRatios(MultiDiracDetMultiWalkerResource &mw_res, int ref, const OffloadVector< ValueType > &det0_list, const RefVector< OffloadMatrix< ValueType >> &psiinv_list, const RefVector< OffloadMatrix< ValueType >> &psi_list, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign, const RefVector< OffloadMatrix< ValueType >> &table_matrix_list, const RefVector< OffloadVector< ValueType >> &ratios_list)
Tensor< T, D > inverse(const Tensor< T, D > &a)
Definition: TensorOps.h:879
void evaluateDetsForPtclMove(const ParticleSet &P, int iat, int refPtcl=-1)
evaluate the value of all the unique determinants with one electron moved.
OffloadMatrix< ValueType > psiM
psiM(i,j) TpsiM(i,j)
OffloadVector< ValueType > new_ratios_to_ref_
new determinant ratios with respect to the updated reference determinant upon a proposed move ...
ResourceHandle< MultiDiracDetMultiWalkerResource > mw_res_handle_
void evaluateDetsAndGradsForPtclMove(const ParticleSet &P, int iat)
evaluate the value and gradients of all the unique determinants with one electron moved...
void mw_InverseUpdateByColumn(MultiDiracDetMultiWalkerResource &mw_res, const int working_index, const OffloadVector< ValueType > &curRatio_list, const OffloadVector< ValueType *> &psiV_deviceptr_list, const OffloadVector< ValueType *> &psiMinv_deviceptr_list, const size_t psiMinv_rows) const
LatticeGaussianProduct::ValueType ValueType
void buildTableMatrix_calculateRatios(int ref, const OffloadMatrix< ValueType > &psiinv, const OffloadMatrix< ValueType > &psi, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign, OffloadMatrix< ValueType > &table_matrix, OffloadVector< ValueType > &ratios)
compute the ratio of the excited determinants to the reference determinant
const int FirstIndex
index of the first particle with respect to the particle set
void evaluate(Matrix< T, Alloc > &lhs, const Op &op, const Expression< RHS > &rhs)
Definition: OhmmsMatrix.h:514
const int LastIndex
index of the last particle with respect to the particle set
static void mw_evaluateDetsAndGradsForPtclMove(const RefVectorWithLeader< MultiDiracDeterminant > &det_list, const RefVectorWithLeader< ParticleSet > &P_list, int iat, UnpinnedOffloadMatrix< ValueType > &mw_grads)
multi walker version of mw_evaluateDetsAndGradsForPtclMove
ompBLAS_status copy_batched(ompBLAS_handle &handle, const int n, const T *const x[], const int incx, T *const y[], const int incy, const int batch_count)
copy device data from x to y
void mw_updateRatios(const size_t det_offset, const size_t data_offset, const OffloadVector< int > &data, const OffloadVector< RealType > &sign, const OffloadVector< ValueType *> &table_matrix_deviceptr_list, const size_t num_table_matrix_cols, const OffloadVector< ValueType *> &ratios_deviceptr_list) const
update ratios with respect to the reference deteriminant for a given excitation level ...
void evaluateGrads(ParticleSet &P, int iat)
evaluate the gradients of all the unique determinants with one electron moved. Used by the table meth...
void InverseUpdateByColumn(Matrix< T, ALLOC > &Minv, Vector< T, ALLOC > &newcol, Vector< T, ALLOC > &rvec, Vector< T, ALLOC > &rvecinv, int colchanged, T c_ratio)
void buildTableMatrix_calculateGradRatios(int ref, const OffloadMatrix< ValueType > &psiinv, const OffloadMatrix< ValueType > &psi, const OffloadVector< int > &data, const VectorSoaContainer< int, 2, OffloadPinnedAllocator< int >> &pairs, const OffloadVector< RealType > &sign, const ValueType &det0_grad, OffloadMatrix< ValueType > &table_matrix, int dx, int iat, Matrix< GradType > &grads)
Function to calculate the ratio of the gradients of the excited determinant to the reference determin...