QMCPACK
AccelMatrixUpdateOMPTarget.hpp
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2024 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////


#ifndef QMCPLUSPLUS_COMPUTE_MATRIX_UPDATE_OMPTARGET_H
#define QMCPLUSPLUS_COMPUTE_MATRIX_UPDATE_OMPTARGET_H

#include <QueueAliases.hpp>

namespace qmcplusplus
{

namespace compute
{

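/** For every walker in the batch: subtract one from temp[rowchanged], copy the
 *  changed row of Ainv into rcopy, and unpack the gradient and Laplacian blocks
 *  of phi_vgl_in into dphi_out and d2phi_out.
 */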
template<typename T>
void copyAinvRow_saveGL_batched(Queue<PlatformKind::OMPTARGET>& queue,
                                const int rowchanged,
                                const int n,
                                const T* const Ainv[],
                                const int lda,
                                T* const temp[],
                                T* const rcopy[],
                                const T* const phi_vgl_in[],
                                const size_t phi_vgl_stride,
                                T* const dphi_out[],
                                T* const d2phi_out[],
                                const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainv, temp, rcopy, phi_vgl_in, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ Ainv_iw   = Ainv[iw];
    T* __restrict__ temp_iw         = temp[iw];
    T* __restrict__ rcopy_iw        = rcopy[iw];
    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
    T* __restrict__ dphi_out_iw     = dphi_out[iw];
    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

    temp_iw[rowchanged] = temp_iw[rowchanged] - T(1);

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      rcopy_iw[col_id] = Ainv_iw[rowchanged * lda + col_id];

      // the following copying data on the device is not part of SM-1
      // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel.
      dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
      dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
      dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
      d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
    }
  }
}

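/** For every walker, compute the dot products of the inverse-matrix row with the
 *  x/y/z gradient components of dpsiM and store the three sums in grads_now.
 */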
template<typename T>
void calcGradients_batched(Queue<PlatformKind::OMPTARGET>& queue,
                           const int n,
                           const T* const Ainvrow[],
                           const T* const dpsiMrow[],
                           T* const grads_now,
                           const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainvrow, dpsiMrow, grads_now)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ invRow    = Ainvrow[iw];
    const T* __restrict__ dpsiM_row = dpsiMrow[iw];

    T sum_x = 0;
    T sum_y = 0;
    T sum_z = 0;

    PRAGMA_OFFLOAD("omp parallel for reduction(+: sum_x,sum_y,sum_z)")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      sum_x += invRow[col_id] * dpsiM_row[col_id * 3];
      sum_y += invRow[col_id] * dpsiM_row[col_id * 3 + 1];
      sum_z += invRow[col_id] * dpsiM_row[col_id * 3 + 2];
    }

    grads_now[iw * 3]     = sum_x;
    grads_now[iw * 3 + 1] = sum_y;
    grads_now[iw * 3 + 2] = sum_z;
  }
}

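/** For each accepted walker (iw < n_accepted): append the changed row to delay_list,
 *  scale the current binv row by ratio_inv, and copy value, gradient and Laplacian
 *  from phi_vgl_in into phi_out, dphi_out and d2phi_out. For the remaining walkers
 *  a "fake accept" is recorded: the binv row and column are zeroed, the diagonal
 *  element set to one, the delay_list entry set to -1, and the phi_out row cleared.
 */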
template<typename T>
void add_delay_list_save_sigma_VGL_batched(Queue<PlatformKind::OMPTARGET>& queue,
                                           int* const delay_list[],
                                           const int rowchanged,
                                           const int delay_count,
                                           T* const binv[],
                                           const int binv_lda,
                                           const T* const ratio_inv,
                                           const T* const phi_vgl_in[],
                                           const size_t phi_vgl_stride,
                                           T* const phi_out[],
                                           T* const dphi_out[],
                                           T* const d2phi_out[],
                                           const int norb,
                                           const int n_accepted,
                                           const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute \
                  is_device_ptr(delay_list, binv, ratio_inv, phi_vgl_in, phi_out, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
    if (iw < n_accepted)
    {
      // real accept, settle y and Z
      int* __restrict__ delay_list_iw = delay_list[iw];
      T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
      const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
      T* __restrict__ phi_out_iw      = phi_out[iw];
      T* __restrict__ dphi_out_iw     = dphi_out[iw];
      T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

      delay_list_iw[delay_count] = rowchanged;
      binvrow_iw[delay_count]    = ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binvrow_iw[col_id] *= ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel.
        phi_out_iw[col_id]          = phi_in_iw[col_id];
        dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
        dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
        dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
        d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
      }
    }
    else
    {
      // fake accept. Set Y, Z with zero and x with 1
      T* __restrict__ binv_iw = binv[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] = T(0);

      int* __restrict__ delay_list_iw               = delay_list[iw];
      binv_iw[delay_count * binv_lda + delay_count] = T(1);
      delay_list_iw[delay_count]                    = -1;

      T* __restrict__ Urow_iw = phi_out[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        Urow_iw[col_id] = T(0);
      }
    }
}


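/** For every walker, subtract one from tempMat at (delay_list[col], col) for each
 *  valid (non-negative) entry of delay_list.
 */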
template<typename T>
void applyW_batched(Queue<PlatformKind::OMPTARGET>& queue,
                    const int* const delay_list[],
                    const int delay_count,
                    T* const tempMat[],
                    const int lda,
                    const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(delay_list, tempMat)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const int* __restrict__ delay_list_iw = delay_list[iw];
    T* __restrict__ tempMat_iw            = tempMat[iw];

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < delay_count; col_id++)
    {
      const int row_id = delay_list_iw[col_id];
      if (row_id >= 0)
        tempMat_iw[row_id * lda + col_id] = tempMat_iw[row_id * lda + col_id] - T(1);
    }
  }
}


} // namespace compute
} // namespace qmcplusplus
#endif
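The two kernels above that read phi_vgl_in assume the same packed layout: five consecutive blocks of length phi_vgl_stride holding values, x/y/z gradient components, and Laplacians. A minimal standalone sketch of that layout and the unpacking the kernels perform, in plain C++ with hypothetical sizes (a single walker, so phi_vgl_stride is taken equal to the number of orbitals; no QMCPACK containers are used):

#include <cstddef>
#include <vector>

int main()
{
  // Hypothetical sizes for illustration only.
  const std::size_t norb           = 4;
  const std::size_t phi_vgl_stride = norb;

  // Packed buffer: [ V | Gx | Gy | Gz | L ], each block of length phi_vgl_stride.
  std::vector<double> phi_vgl(5 * phi_vgl_stride, 0.0);

  // Unpack the way copyAinvRow_saveGL_batched / add_delay_list_save_sigma_VGL_batched do:
  // gradients interleaved as (x, y, z) per orbital, Laplacians in a separate array.
  std::vector<double> dphi(norb * 3), d2phi(norb);
  for (std::size_t col = 0; col < norb; ++col)
  {
    dphi[col * 3]     = phi_vgl[col + phi_vgl_stride];     // grad x
    dphi[col * 3 + 1] = phi_vgl[col + phi_vgl_stride * 2]; // grad y
    dphi[col * 3 + 2] = phi_vgl[col + phi_vgl_stride * 3]; // grad z
    d2phi[col]        = phi_vgl[col + phi_vgl_stride * 4]; // Laplacian
  }
  return 0;
}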