#ifndef QMCPLUSPLUS_COMPUTE_MATRIX_UPDATE_OMPTARGET_H
#define QMCPLUSPLUS_COMPUTE_MATRIX_UPDATE_OMPTARGET_H

/** For each walker in the batch: subtract one from temp at the changed row, copy the
 *  changed row of Ainv into rcopy, and repack the gradients and laplacian stored in
 *  phi_vgl_in into dphi_out and d2phi_out.
 */
template<typename T>
void copyAinvRow_saveGL_batched(const int rowchanged,
                                const int n,
                                const T* const Ainv[],
                                const int lda,
                                T* const temp[],
                                T* const rcopy[],
                                const T* const phi_vgl_in[],
                                const size_t phi_vgl_stride,
                                T* const dphi_out[],
                                T* const d2phi_out[],
                                const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainv, temp, rcopy, phi_vgl_in, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ Ainv_iw   = Ainv[iw];
    T* __restrict__ temp_iw         = temp[iw];
    T* __restrict__ rcopy_iw        = rcopy[iw];
    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
    T* __restrict__ dphi_out_iw     = dphi_out[iw];
    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

    temp_iw[rowchanged] = temp_iw[rowchanged] - T(1);

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      rcopy_iw[col_id] = Ainv_iw[rowchanged * lda + col_id];

      // repack gradients and laplacian
      dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
      dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
      dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
      d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
    }
  }
}
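// Layout note (inferred from the indexing above, not original documentation): phi_vgl_in[iw]
// is assumed to point into a structure-of-arrays block in which the orbital values, the x/y/z
// gradient components and the laplacians occupy consecutive segments separated by
// phi_vgl_stride elements:
//
//   value : phi_in_iw[col]
//   grad  : phi_in_iw[col + phi_vgl_stride * {1, 2, 3}]
//   lapl  : phi_in_iw[col + phi_vgl_stride * 4]
//
// while dphi_out[iw] is packed as interleaved (x, y, z) triplets per orbital.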
/** For each walker in the batch, compute the dot product of the inverse-matrix row with the
 *  packed (x, y, z) gradient row and store the resulting 3-component gradient in grads_now.
 */
template<typename T>
void calcGradients_batched(const int n,
                           const T* const Ainvrow[],
                           const T* const dpsiMrow[],
                           T* const grads_now,
                           const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainvrow, dpsiMrow, grads_now)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ invRow    = Ainvrow[iw];
    const T* __restrict__ dpsiM_row = dpsiMrow[iw];

    T sum_x(0), sum_y(0), sum_z(0);

    PRAGMA_OFFLOAD("omp parallel for reduction(+: sum_x,sum_y,sum_z)")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      sum_x += invRow[col_id] * dpsiM_row[col_id * 3];
      sum_y += invRow[col_id] * dpsiM_row[col_id * 3 + 1];
      sum_z += invRow[col_id] * dpsiM_row[col_id * 3 + 2];
    }

    grads_now[iw * 3]     = sum_x;
    grads_now[iw * 3 + 1] = sum_y;
    grads_now[iw * 3 + 2] = sum_z;
  }
}
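// A minimal single-walker, host-side sketch of the reduction performed above; the helper name
// is hypothetical and not part of this header. It shows the gradient as the dot product of the
// inverse-matrix row with each Cartesian component of the packed gradient row.
template<typename T>
inline void calcGradients_one_walker_ref(const int n, const T* invRow, const T* dpsiM_row, T grad[3])
{
  grad[0] = grad[1] = grad[2] = T(0);
  for (int col = 0; col < n; col++)
  {
    grad[0] += invRow[col] * dpsiM_row[col * 3];     // x component
    grad[1] += invRow[col] * dpsiM_row[col * 3 + 1]; // y component
    grad[2] += invRow[col] * dpsiM_row[col * 3 + 2]; // z component
  }
}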
/** Update the delay list, Binv and the saved orbital data after a batch of proposed moves. */
template<typename T>
void add_delay_list_save_sigma_VGL_batched(int* const delay_list[],
                                           const int rowchanged,
                                           const int delay_count,
                                           T* const binv[],
                                           const int binv_lda,
                                           const T* const ratio_inv,
                                           const T* const phi_vgl_in[],
                                           const size_t phi_vgl_stride,
                                           T* const phi_out[],
                                           T* const dphi_out[],
                                           T* const d2phi_out[],
                                           const int norb,
                                           const int n_accepted,
                                           const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute \
                  is_device_ptr(delay_list, binv, ratio_inv, phi_vgl_in, phi_out, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    if (iw < n_accepted)
    {
      // accepted move: update the delay list, Binv and the saved orbital data
      int* __restrict__ delay_list_iw = delay_list[iw];
      T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
      const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
      T* __restrict__ phi_out_iw      = phi_out[iw];
      T* __restrict__ dphi_out_iw     = dphi_out[iw];
      T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

      delay_list_iw[delay_count] = rowchanged;
      binvrow_iw[delay_count]    = ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binvrow_iw[col_id] *= ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        // copy the orbital values and repack gradients and laplacian
        phi_out_iw[col_id]          = phi_in_iw[col_id];
        dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
        dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
        dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
        d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
      }
    }
    else
    {
      // rejected move: reset the new Binv row and column and invalidate the delay-list slot
      T* __restrict__ binv_iw = binv[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] = T(0);

      int* __restrict__ delay_list_iw               = delay_list[iw];
      binv_iw[delay_count * binv_lda + delay_count] = T(1);
      delay_list_iw[delay_count]                    = -1;

      T* __restrict__ Urow_iw = phi_out[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
        Urow_iw[col_id] = T(0);
    }
  }
}
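// Note on the two branches above (reading of the code, not original documentation): accepted
// walkers (iw < n_accepted) record the changed row in the delay list, place ratio_inv on the new
// Binv row (and rescale its existing entries), and save the fresh orbital values, gradients and
// laplacians. Rejected walkers instead clear the new Binv row and column (offsets
// delay_count * binv_lda + col and col * binv_lda + delay_count), set the diagonal entry to one
// so the slot behaves as an identity row/column in later solves, store -1 in the delay list so
// applyW_batched can skip it, and zero the corresponding row of phi_out.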
/** For each walker, subtract one from the tempMat element at offset delay_list[col] * lda + col
 *  for every column whose delay-list entry is valid (non-negative).
 */
template<typename T>
void applyW_batched(const int* const delay_list[],
                    const int delay_count,
                    T* const tempMat[],
                    const int lda,
                    const int batch_count)
{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(delay_list, tempMat)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const int* __restrict__ delay_list_iw = delay_list[iw];
    T* __restrict__ tempMat_iw            = tempMat[iw];

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < delay_count; col_id++)
    {
      const int row_id = delay_list_iw[col_id];
      if (row_id >= 0)
        tempMat_iw[row_id * lda + col_id] = tempMat_iw[row_id * lda + col_id] - T(1);
    }
  }
}
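// Columns whose delay-list entry was set to -1 by the rejected-move branch of
// add_delay_list_save_sigma_VGL_batched are skipped above, so rejected walkers leave tempMat
// untouched. A minimal single-walker, host-side sketch of the same update, assuming storage with
// leading dimension lda; the helper name is hypothetical and not part of this header.
template<typename T>
inline void applyW_one_walker_ref(const int* delay_list, const int delay_count, T* tempMat, const int lda)
{
  for (int col = 0; col < delay_count; col++)
    if (delay_list[col] >= 0)
      tempMat[delay_list[col] * lda + col] -= T(1);
}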