QMCPACK
qmcplusplus::compute Namespace Reference

Namespaces

 BLAS
 

Classes

class  BLASHandle
 
class  BLASHandle< PlatformKind::CUDA >
 
class  BLASHandle< PlatformKind::OMPTARGET >
 
class  BLASHandle< PlatformKind::SYCL >
 
class  Queue
 
class  Queue< PlatformKind::CUDA >
 
class  Queue< PlatformKind::OMPTARGET >
 
class  Queue< PlatformKind::SYCL >
 

Functions

template<typename T >
void copyAinvRow_saveGL_batched (Queue< PlatformKind::CUDA > &queue, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count)
 
template<typename T >
void calcGradients_batched (Queue< PlatformKind::CUDA > &queue, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count)
 
template<typename T >
void add_delay_list_save_sigma_VGL_batched (Queue< PlatformKind::CUDA > &queue, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
 
template<typename T >
void applyW_batched (Queue< PlatformKind::CUDA > &queue, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count)
 
template<typename T >
void copyAinvRow_saveGL_batched (Queue< PlatformKind::OMPTARGET > &queue, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count)
 
template<typename T >
void calcGradients_batched (Queue< PlatformKind::OMPTARGET > &queue, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count)
 
template<typename T >
void add_delay_list_save_sigma_VGL_batched (Queue< PlatformKind::OMPTARGET > &queue, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
 
template<typename T >
void applyW_batched (Queue< PlatformKind::OMPTARGET > &queue, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count)
 
template<typename T >
void copyAinvRow_saveGL_batched (Queue< PlatformKind::SYCL > &queue, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count)
 
template<typename T >
void calcGradients_batched (Queue< PlatformKind::SYCL > &queue, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count)
 
template<typename T >
void add_delay_list_save_sigma_VGL_batched (Queue< PlatformKind::SYCL > &queue, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
 
template<typename T >
void applyW_batched (Queue< PlatformKind::SYCL > &queue, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count)
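These overload sets differ only in the platform tag carried by the Queue argument, so generic callers such as DelayedUpdateBatched can be written once against a PlatformKind template parameter and let overload resolution pick the CUDA, OMPTARGET, or SYCL implementation. A minimal sketch of that dispatch pattern follows; the wrapper name is hypothetical, the code is assumed to live inside namespace qmcplusplus, and the appropriate AccelMatrixUpdate header for the chosen platform is assumed to be included.

// Hypothetical caller, templated on the platform tag, showing how the
// per-platform overloads listed above are selected by overload resolution.
template<PlatformKind PL, typename T>
void evalGrad_sketch(compute::Queue<PL>& queue,
                     const int n,
                     const T* const Ainvrow[],   // one row of A^-1 per walker (device pointers)
                     const T* const dpsiMrow[],  // interleaved gradient rows per walker (device pointers)
                     T* const grads_now,         // device buffer holding 3 * batch_count values
                     const int batch_count)
{
  // Resolves to the CUDA, OMPTARGET, or SYCL overload depending on PL.
  compute::calcGradients_batched(queue, n, Ainvrow, dpsiMrow, grads_now, batch_count);
}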
 

Class Documentation

◆ qmcplusplus::compute::BLASHandle

class qmcplusplus::compute::BLASHandle

template<PlatformKind PL>
class qmcplusplus::compute::BLASHandle< PL >

Definition at line 24 of file AccelBLASHandle.hpp.


◆ qmcplusplus::compute::Queue

class qmcplusplus::compute::Queue

template<PlatformKind PL>
class qmcplusplus::compute::Queue< PL >

Definition at line 24 of file Queue.hpp.

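Queue is a thin, platform-tagged wrapper around the native asynchronous execution resource of each backend (for example a CUDA stream for PlatformKind::CUDA). The only member relied upon by the kernels documented below is getNative(), which exposes that native handle. A minimal usage sketch follows; the default constructor and the sync() member used for host-side synchronization are illustrative assumptions, not confirmed by this page.

// Illustrative only: create a CUDA-tagged queue, hand it to one of the
// batched kernels below, then wait for completion on the host.
qmcplusplus::compute::Queue<PlatformKind::CUDA> queue;   // assumed default-constructible
qmcplusplus::compute::applyW_batched(queue, delay_list, delay_count, tempMat, lda, batch_count);
queue.sync();                                            // assumed host-side synchronization point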

Function Documentation

◆ add_delay_list_save_sigma_VGL_batched() [1/3]

void qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched ( Queue< PlatformKind::CUDA > &  queue,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count 
)

Definition at line 57 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::add_delay_list_save_sigma_VGL_batched(), qmcplusplus::cudaErrorCheck(), and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_accept_rejectRow().

{
  cudaErrorCheck(CUDA::add_delay_list_save_sigma_VGL_batched(queue.getNative(), delay_list, rowchanged, delay_count,
                                                             binv, binv_lda, ratio_inv, phi_vgl_in, phi_vgl_stride,
                                                             phi_out, dphi_out, d2phi_out, norb, n_accepted,
                                                             batch_count),
                 "CUDA::add_delay_list_save_y_VGL_batched failed!");
}

◆ add_delay_list_save_sigma_VGL_batched() [2/3]

void qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched ( Queue< PlatformKind::SYCL > &  queue,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count 
)

Definition at line 69 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::add_delay_list_save_sigma_VGL_batched() and qmcplusplus::queue.

{
  try
  {
    SYCL::add_delay_list_save_sigma_VGL_batched(queue.getNative(), delay_list, rowchanged, delay_count, binv, binv_lda,
                                                ratio_inv, phi_vgl_in, phi_vgl_stride, phi_out, dphi_out, d2phi_out,
                                                norb, n_accepted, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::add_delay_list_save_y_VGL_batched exception: ") + e.what());
  }
}

◆ add_delay_list_save_sigma_VGL_batched() [3/3]

void qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count 
)

Definition at line 98 of file AccelMatrixUpdateOMPTarget.hpp.

{
  PRAGMA_OFFLOAD("omp target teams distribute \
                  is_device_ptr(delay_list, binv, ratio_inv, phi_vgl_in, phi_out, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
    if (iw < n_accepted)
    {
      // real accept, settle y and Z
      int* __restrict__ delay_list_iw = delay_list[iw];
      T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
      const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
      T* __restrict__ phi_out_iw      = phi_out[iw];
      T* __restrict__ dphi_out_iw     = dphi_out[iw];
      T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

      delay_list_iw[delay_count] = rowchanged;
      binvrow_iw[delay_count]    = ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binvrow_iw[col_id] *= ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel.
        phi_out_iw[col_id]          = phi_in_iw[col_id];
        dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
        dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
        dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
        d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
      }
    }
    else
    {
      // fake accept. Set Y, Z with zero and x with 1
      T* __restrict__ binv_iw = binv[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] = T(0);

      int* __restrict__ delay_list_iw = delay_list[iw];
      binv_iw[delay_count * binv_lda + delay_count] = T(1);
      delay_list_iw[delay_count]                    = -1;

      T* __restrict__ Urow_iw = phi_out[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        Urow_iw[col_id] = T(0);
      }
    }
}
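To summarize the two branches above: for an accepted walker the new row of binv is scaled by the walker's ratio_inv, the changed row index is recorded in delay_list, and the incoming value, gradient, and Laplacian data are scattered from phi_vgl_in into phi_out, dphi_out, and d2phi_out. For a rejected walker (a "fake accept" that keeps the batch in lockstep) the update is made a no-op: the new row and column of binv are zeroed, the diagonal entry is set to one, the delay_list entry is flagged with -1 (later skipped by applyW_batched), and the corresponding row of phi_out is cleared. In matrix terms, with k = delay_count, the fake-accept branch enforces

  binv[k][j] = binv[j][k] = delta_{jk}  for j = 0 .. k,    delay_list[k] = -1,    phi_out[0 .. norb-1] = 0.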

◆ applyW_batched() [1/3]

void qmcplusplus::compute::applyW_batched ( Queue< PlatformKind::CUDA > &  queue,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count 
)

Definition at line 82 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::applyW_batched(), qmcplusplus::cudaErrorCheck(), qmcplusplus::lda, and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_updateInvMat().

{
  cudaErrorCheck(CUDA::applyW_batched(queue.getNative(), delay_list, delay_count, tempMat, lda, batch_count),
                 "CUDA::applyW_batched failed!");
}

◆ applyW_batched() [2/3]

void qmcplusplus::compute::applyW_batched ( Queue< PlatformKind::SYCL > &  queue,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count 
)

Definition at line 99 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::applyW_batched(), qmcplusplus::lda, and qmcplusplus::queue.

{
  try
  {
    SYCL::applyW_batched(queue.getNative(), delay_list, delay_count, tempMat, lda, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::applyW_batched exception: ") + e.what());
  }
}

◆ applyW_batched() [3/3]

void qmcplusplus::compute::applyW_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count 
)

Definition at line 168 of file AccelMatrixUpdateOMPTarget.hpp.

References qmcplusplus::lda.

{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(delay_list, tempMat)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const int* __restrict__ delay_list_iw = delay_list[iw];
    T* __restrict__ tempMat_iw            = tempMat[iw];

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < delay_count; col_id++)
    {
      const int row_id = delay_list_iw[col_id];
      if (row_id >= 0)
        tempMat_iw[row_id * lda + col_id] = tempMat_iw[row_id * lda + col_id] - T(1);
    }
  }
}
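The body above makes the effect of applyW_batched explicit: for each walker and each column col_id whose delayed row index row_id = delay_list[col_id] is non-negative, one is subtracted from tempMat at (row_id, col_id), i.e.

  tempMat <- tempMat - W,   with W[i][j] = delta_{i, delay_list[j]},

while columns flagged with -1 by a fake accept are left unchanged.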

◆ calcGradients_batched() [1/3]

void qmcplusplus::compute::calcGradients_batched ( Queue< PlatformKind::CUDA > &  queue,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count 
)

Definition at line 45 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::calcGradients_batched(), qmcplusplus::cudaErrorCheck(), qmcplusplus::n, and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_evalGrad().

{
  cudaErrorCheck(CUDA::calcGradients_batched(queue.getNative(), n, Ainvrow, dpsiMrow, grads_now, batch_count),
                 "CUDA::calcGradients_cuda failed!");
}

◆ calcGradients_batched() [2/3]

void qmcplusplus::compute::calcGradients_batched ( Queue< PlatformKind::SYCL > &  queue,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count 
)

Definition at line 51 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::calcGradients_batched(), qmcplusplus::n, and qmcplusplus::queue.

{
  try
  {
    SYCL::calcGradients_batched(queue.getNative(), n, Ainvrow, dpsiMrow, grads_now, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::calcGradients_batched exception: ") + e.what());
  }
}

◆ calcGradients_batched() [3/3]

void qmcplusplus::compute::calcGradients_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count 
)

Definition at line 66 of file AccelMatrixUpdateOMPTarget.hpp.

References qmcplusplus::n.

{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainvrow, dpsiMrow, grads_now)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ invRow    = Ainvrow[iw];
    const T* __restrict__ dpsiM_row = dpsiMrow[iw];

    T sum_x = 0;
    T sum_y = 0;
    T sum_z = 0;

    PRAGMA_OFFLOAD("omp parallel for reduction(+: sum_x,sum_y,sum_z)")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      sum_x += invRow[col_id] * dpsiM_row[col_id * 3];
      sum_y += invRow[col_id] * dpsiM_row[col_id * 3 + 1];
      sum_z += invRow[col_id] * dpsiM_row[col_id * 3 + 2];
    }

    grads_now[iw * 3]     = sum_x;
    grads_now[iw * 3 + 1] = sum_y;
    grads_now[iw * 3 + 2] = sum_z;
  }
}
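Equivalently, each walker's gradient is the dot product of its A^{-1} row with the three gradient components stored interleaved (stride 3) in dpsiMrow, taking those components as the x, y, z orbital derivatives:

  grads_now[iw] = ( sum_j invRow[j] * dx_phi_j ,  sum_j invRow[j] * dy_phi_j ,  sum_j invRow[j] * dz_phi_j ).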

◆ copyAinvRow_saveGL_batched() [1/3]

void qmcplusplus::compute::copyAinvRow_saveGL_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count 
)

Definition at line 25 of file AccelMatrixUpdateOMPTarget.hpp.

References qmcplusplus::lda, and qmcplusplus::n.

{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainv, temp, rcopy, phi_vgl_in, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ Ainv_iw   = Ainv[iw];
    T* __restrict__ temp_iw         = temp[iw];
    T* __restrict__ rcopy_iw        = rcopy[iw];
    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
    T* __restrict__ dphi_out_iw     = dphi_out[iw];
    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

    temp_iw[rowchanged] = temp_iw[rowchanged] - T(1);

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      rcopy_iw[col_id] = Ainv_iw[rowchanged * lda + col_id];

      // the following copying data on the device is not part of SM-1
      // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel.
      dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
      dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
      dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
      d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
    }
  }
}
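The strided offsets used above imply the following per-walker layout of phi_vgl_in, consistent with the VGL naming; the value block at offset 0 is consumed elsewhere (e.g. by add_delay_list_save_sigma_VGL_batched). The component ordering is inferred from the indexing and shown only as an assumed sketch.

// Assumed layout of one walker's phi_vgl_in buffer, with stride = phi_vgl_stride:
// [ 0          , 1 * stride )  -> orbital values         (untouched by this kernel)
// [ 1 * stride , 2 * stride )  -> x gradient components  -> dphi_out[col * 3]
// [ 2 * stride , 3 * stride )  -> y gradient components  -> dphi_out[col * 3 + 1]
// [ 3 * stride , 4 * stride )  -> z gradient components  -> dphi_out[col * 3 + 2]
// [ 4 * stride , 5 * stride )  -> Laplacians              -> d2phi_out[col]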

◆ copyAinvRow_saveGL_batched() [2/3]

void qmcplusplus::compute::copyAinvRow_saveGL_batched ( Queue< PlatformKind::CUDA > &  queue,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count 
)

Definition at line 26 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::copyAinvRow_saveGL_batched(), qmcplusplus::cudaErrorCheck(), qmcplusplus::lda, qmcplusplus::n, and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_updateRow().

{
  cudaErrorCheck(CUDA::copyAinvRow_saveGL_batched(queue.getNative(), rowchanged, n, Ainv, lda, temp, rcopy, phi_vgl_in,
                                                  phi_vgl_stride, dphi_out, d2phi_out, batch_count),
                 "CUDA::copyAinvRow_saveGL_cuda failed!");
}

◆ copyAinvRow_saveGL_batched() [3/3]

void qmcplusplus::compute::copyAinvRow_saveGL_batched ( Queue< PlatformKind::SYCL > &  queue,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count 
)

Definition at line 26 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::copyAinvRow_saveGL_batched(), qmcplusplus::lda, qmcplusplus::n, and qmcplusplus::queue.

{
  try
  {
    SYCL::copyAinvRow_saveGL_batched(queue.getNative(), rowchanged, n, Ainv, lda, temp, rcopy, phi_vgl_in,
                                     phi_vgl_stride, dphi_out, d2phi_out, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::copyAinvRow_saveGL_batched exception: ") + e.what());
  }
}