QMCPACK
qmcplusplus::SYCL Namespace Reference

Functions

template<typename T >
sycl::event copyAinvRow_saveGL_batched (sycl::queue &aq, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const int phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event copyAinvRow_saveGL_batched (sycl::queue &aq, const int rowchanged, const int n, const float *const Ainv[], const int lda, float *const temp[], float *const rcopy[], const float *const phi_vgl_in[], const int phi_vgl_stride, float *const dphi_out[], float *const d2phi_out[], const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event copyAinvRow_saveGL_batched (sycl::queue &aq, const int rowchanged, const int n, const double *const Ainv[], const int lda, double *const temp[], double *const rcopy[], const double *const phi_vgl_in[], const int phi_vgl_stride, double *const dphi_out[], double *const d2phi_out[], const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event copyAinvRow_saveGL_batched (sycl::queue &aq, const int rowchanged, const int n, const std::complex< float > *const Ainv[], const int lda, std::complex< float > *const temp[], std::complex< float > *const rcopy[], const std::complex< float > *const phi_vgl_in[], const int phi_vgl_stride, std::complex< float > *const dphi_out[], std::complex< float > *const d2phi_out[], const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event copyAinvRow_saveGL_batched (sycl::queue &aq, const int rowchanged, const int n, const std::complex< double > *const Ainv[], const int lda, std::complex< double > *const temp[], std::complex< double > *const rcopy[], const std::complex< double > *const phi_vgl_in[], const int phi_vgl_stride, std::complex< double > *const dphi_out[], std::complex< double > *const d2phi_out[], const int batch_count, const std::vector< sycl::event > &dependencies)
 
template<typename T , int DIM>
sycl::event calcGradients_batched (sycl::queue &aq, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event calcGradients_batched (sycl::queue &aq, const int n, const float *const Ainvrow[], const float *const dpsiMrow[], float *const grads_now, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event calcGradients_batched (sycl::queue &aq, const int n, const double *const Ainvrow[], const double *const dpsiMrow[], double *const grads_now, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event calcGradients_batched (sycl::queue &aq, const int n, const std::complex< float > *const Ainvrow[], const std::complex< float > *const dpsiMrow[], std::complex< float > *const grads_now, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event calcGradients_batched (sycl::queue &aq, const int n, const std::complex< double > *const Ainvrow[], const std::complex< double > *const dpsiMrow[], std::complex< double > *const grads_now, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template<typename T >
sycl::event add_delay_list_save_sigma_VGL_batched (sycl::queue &aq, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const int phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event add_delay_list_save_sigma_VGL_batched (sycl::queue &aq, int *const delay_list[], const int rowchanged, const int delay_count, float *const binv[], const int binv_lda, const float *const ratio_inv, const float *const phi_vgl_in[], const int phi_vgl_stride, float *const phi_out[], float *const dphi_out[], float *const d2phi_out[], const int norb, const int n_accepted, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event add_delay_list_save_sigma_VGL_batched (sycl::queue &aq, int *const delay_list[], const int rowchanged, const int delay_count, double *const binv[], const int binv_lda, const double *const ratio_inv, const double *const phi_vgl_in[], const int phi_vgl_stride, double *const phi_out[], double *const dphi_out[], double *const d2phi_out[], const int norb, const int n_accepted, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event add_delay_list_save_sigma_VGL_batched (sycl::queue &aq, int *const delay_list[], const int rowchanged, const int delay_count, std::complex< float > *const binv[], const int binv_lda, const std::complex< float > *const ratio_inv, const std::complex< float > *const phi_vgl_in[], const int phi_vgl_stride, std::complex< float > *const phi_out[], std::complex< float > *const dphi_out[], std::complex< float > *const d2phi_out[], const int norb, const int n_accepted, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event add_delay_list_save_sigma_VGL_batched (sycl::queue &aq, int *const delay_list[], const int rowchanged, const int delay_count, std::complex< double > *const binv[], const int binv_lda, const std::complex< double > *const ratio_inv, const std::complex< double > *const phi_vgl_in[], const int phi_vgl_stride, std::complex< double > *const phi_out[], std::complex< double > *const dphi_out[], std::complex< double > *const d2phi_out[], const int norb, const int n_accepted, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template<typename T >
sycl::event applyW_batched (sycl::queue &aq, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event applyW_batched (sycl::queue &aq, const int *const delay_list[], const int delay_count, float *const tempMat[], const int lda, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event applyW_batched (sycl::queue &aq, const int *const delay_list[], const int delay_count, double *const tempMat[], const int lda, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event applyW_batched (sycl::queue &aq, const int *const delay_list[], const int delay_count, std::complex< float > *const tempMat[], const int lda, const int batch_count, const std::vector< sycl::event > &dependencies)
 
template sycl::event applyW_batched (sycl::queue &aq, const int *const delay_list[], const int delay_count, std::complex< double > *const tempMat[], const int lda, const int batch_count, const std::vector< sycl::event > &dependencies)
 

Function Documentation

◆ add_delay_list_save_sigma_VGL_batched() [1/5]

sycl::event add_delay_list_save_sigma_VGL_batched ( sycl::queue aq,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const int  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

Definition at line 210 of file matrix_update_helper.cpp.

Referenced by qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched().

226 {
227  constexpr int COLBS = 64;
228 
229  return aq.parallel_for(sycl::nd_range<1>{{static_cast<size_t>(batch_count * COLBS)}, {static_cast<size_t>(COLBS)}},
230  dependencies, [=](sycl::nd_item<1> item) {
231  const int tid = item.get_local_id(0); //threadIdx.x;
232  const int iw = item.get_group(0); //blockIdx.x;
233 
234  if (iw < n_accepted)
235  {
236  // real accept, settle y and Z
237  int* __restrict__ delay_list_iw = delay_list[iw];
238  T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda;
239  const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
240  T* __restrict__ phi_out_iw = phi_out[iw];
241  T* __restrict__ dphi_out_iw = dphi_out[iw];
242  T* __restrict__ d2phi_out_iw = d2phi_out[iw];
243 
244  if (tid == 0)
245  {
246  delay_list_iw[delay_count] = rowchanged;
247  binvrow_iw[delay_count] = ratio_inv[iw];
248  }
249 
250  const int num_delay_count_col_blocks = (delay_count + COLBS - 1) / COLBS;
251  for (int ib = 0; ib < num_delay_count_col_blocks; ib++)
252  {
253  const int col_id = ib * COLBS + tid;
254  if (col_id < delay_count)
255  binvrow_iw[col_id] *= ratio_inv[iw];
256  }
257 
258  const int num_col_blocks = (norb + COLBS - 1) / COLBS;
259  for (int ib = 0; ib < num_col_blocks; ib++)
260  {
261  const int col_id = ib * COLBS + tid;
262  if (col_id < norb)
263  {
264  // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel.
265  phi_out_iw[col_id] = phi_in_iw[col_id];
266  dphi_out_iw[col_id * 3] = phi_in_iw[col_id + phi_vgl_stride];
267  dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
268  dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
269  d2phi_out_iw[col_id] = phi_in_iw[col_id + phi_vgl_stride * 4];
270  }
271  }
272  }
273  else
274  {
275  // fake accept. Set Y, Z with zero and x with 1
276  T* __restrict__ Urow_iw = phi_out[iw];
277  const int num_blocks_norb = (norb + COLBS - 1) / COLBS;
278  for (int ib = 0; ib < num_blocks_norb; ib++)
279  {
280  const int col_id = ib * COLBS + tid;
281  if (col_id < norb)
282  Urow_iw[col_id] = T{};
283  }
284 
285  T* __restrict__ binv_iw = binv[iw];
286  const int num_blocks_delay_count = (delay_count + COLBS - 1) / COLBS;
287  for (int ib = 0; ib < num_blocks_delay_count; ib++)
288  {
289  const int col_id = ib * COLBS + tid;
290  if (col_id < delay_count)
291  binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] =
292  T(0);
293  }
294 
295  int* __restrict__ delay_list_iw = delay_list[iw];
296  if (tid == 0)
297  {
298  binv_iw[delay_count * binv_lda + delay_count] = T(1);
299  delay_list_iw[delay_count] = -1;
300  }
301  }
302  });
303 }

◆ add_delay_list_save_sigma_VGL_batched() [2/5]

template sycl::event qmcplusplus::SYCL::add_delay_list_save_sigma_VGL_batched ( sycl::queue aq,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
float *const  binv[],
const int  binv_lda,
const float *const  ratio_inv,
const float *const  phi_vgl_in[],
const int  phi_vgl_stride,
float *const  phi_out[],
float *const  dphi_out[],
float *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ add_delay_list_save_sigma_VGL_batched() [3/5]

template sycl::event qmcplusplus::SYCL::add_delay_list_save_sigma_VGL_batched ( sycl::queue aq,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
double *const  binv[],
const int  binv_lda,
const double *const  ratio_inv,
const double *const  phi_vgl_in[],
const int  phi_vgl_stride,
double *const  phi_out[],
double *const  dphi_out[],
double *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ add_delay_list_save_sigma_VGL_batched() [4/5]

template sycl::event qmcplusplus::SYCL::add_delay_list_save_sigma_VGL_batched ( sycl::queue aq,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
std::complex< float > *const  binv[],
const int  binv_lda,
const std::complex< float > *const  ratio_inv,
const std::complex< float > *const  phi_vgl_in[],
const int  phi_vgl_stride,
std::complex< float > *const  phi_out[],
std::complex< float > *const  dphi_out[],
std::complex< float > *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ add_delay_list_save_sigma_VGL_batched() [5/5]

template sycl::event qmcplusplus::SYCL::add_delay_list_save_sigma_VGL_batched ( sycl::queue aq,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
std::complex< double > *const  binv[],
const int  binv_lda,
const std::complex< double > *const  ratio_inv,
const std::complex< double > *const  phi_vgl_in[],
const int  phi_vgl_stride,
std::complex< double > *const  phi_out[],
std::complex< double > *const  dphi_out[],
std::complex< double > *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ applyW_batched() [1/5]

sycl::event applyW_batched ( sycl::queue aq,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

Definition at line 374 of file matrix_update_helper.cpp.

References qmcplusplus::lda, and qmcplusplus::hdf::num_blocks.

Referenced by qmcplusplus::compute::applyW_batched().

381 {
382  constexpr int COLBS = 32;
383 
384  return aq.parallel_for(sycl::nd_range<1>{{static_cast<size_t>(batch_count * COLBS)}, {static_cast<size_t>(COLBS)}},
385  dependencies, [=](sycl::nd_item<1> item) {
386  const int iw = item.get_group(0); //blockIdx.x;
387  const int* __restrict__ delay_list_iw = delay_list[iw];
388  T* __restrict__ tempMat_iw = tempMat[iw];
389 
390  const T mone(-1);
391  const int tid = item.get_local_id(0); //threadIdx.x;
392  const int num_blocks = (delay_count + COLBS - 1) / COLBS;
393  for (int ib = 0; ib < num_blocks; ib++)
394  {
395  const int col_id = ib * COLBS + tid;
396  if (col_id < delay_count)
397  {
398  const int row_id = delay_list_iw[col_id];
399  if (row_id >= 0)
400  tempMat_iw[row_id * lda + col_id] +=
401  mone; //subtractOne<T>(tempMat_iw[row_id * lda + col_id]);
402  }
403  }
404  });
405 }
const char num_blocks[]
Definition: HDFVersion.h:44

◆ applyW_batched() [2/5]

template sycl::event qmcplusplus::SYCL::applyW_batched ( sycl::queue aq,
const int *const  delay_list[],
const int  delay_count,
float *const  tempMat[],
const int  lda,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ applyW_batched() [3/5]

template sycl::event qmcplusplus::SYCL::applyW_batched ( sycl::queue aq,
const int *const  delay_list[],
const int  delay_count,
double *const  tempMat[],
const int  lda,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ applyW_batched() [4/5]

template sycl::event qmcplusplus::SYCL::applyW_batched ( sycl::queue aq,
const int *const  delay_list[],
const int  delay_count,
std::complex< float > *const  tempMat[],
const int  lda,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ applyW_batched() [5/5]

template sycl::event qmcplusplus::SYCL::applyW_batched ( sycl::queue aq,
const int *const  delay_list[],
const int  delay_count,
std::complex< double > *const  tempMat[],
const int  lda,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ calcGradients_batched() [1/5]

sycl::event calcGradients_batched ( sycl::queue aq,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

Definition at line 128 of file matrix_update_helper.cpp.

References qmcplusplus::ewaldref::DIM, and qmcplusplus::n.

Referenced by qmcplusplus::compute::calcGradients_batched().

135 {
136  constexpr int COLBS = 128;
137 
138  return aq.submit([&](sycl::handler& cgh) {
139  cgh.depends_on(dependencies);
140 
141  sycl::local_accessor<T, 1> sum((static_cast<size_t>(DIM * COLBS)), cgh);
142  cgh.parallel_for(sycl::nd_range<1>{{static_cast<size_t>(batch_count * COLBS)}, {static_cast<size_t>(COLBS)}},
143  [=](sycl::nd_item<1> item) {
144  const int iw = item.get_group(0); //blockIdx.x;
145  const T* __restrict__ invRow = Ainvrow[iw];
146  const T* __restrict__ dpsiM_row = dpsiMrow[iw];
147 
148  const int tid = item.get_local_id(0); //threadIdx.x;
149  for (int idim = 0; idim < DIM; idim++)
150  sum[idim * COLBS + tid] = T{};
151 
152  const int num_col_blocks = (n + COLBS - 1) / COLBS;
153  for (int ib = 0; ib < num_col_blocks; ib++)
154  {
155  const int col_id = ib * COLBS + tid;
156  for (int idim = 0; idim < DIM; idim++)
157  if (col_id < n)
158  sum[idim * COLBS + tid] += invRow[col_id] * dpsiM_row[col_id * DIM + idim];
159  }
160 
161  for (int iend = COLBS / 2; iend > 0; iend /= 2)
162  {
163  item.barrier(sycl::access::fence_space::local_space);
164  for (int idim = 0; idim < DIM; idim++)
165  if (tid < iend)
166  sum[idim * COLBS + tid] += sum[idim * COLBS + tid + iend];
167  }
168 
169  if (tid == 0)
170  for (int idim = 0; idim < DIM; idim++)
171  grads_now[iw * DIM + idim] = sum[idim * COLBS];
172  });
173  });
174 }

◆ calcGradients_batched() [2/5]

template sycl::event qmcplusplus::SYCL::calcGradients_batched ( sycl::queue aq,
const int  n,
const float *const  Ainvrow[],
const float *const  dpsiMrow[],
float *const  grads_now,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ calcGradients_batched() [3/5]

template sycl::event qmcplusplus::SYCL::calcGradients_batched ( sycl::queue aq,
const int  n,
const double *const  Ainvrow[],
const double *const  dpsiMrow[],
double *const  grads_now,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ calcGradients_batched() [4/5]

template sycl::event qmcplusplus::SYCL::calcGradients_batched ( sycl::queue aq,
const int  n,
const std::complex< float > *const  Ainvrow[],
const std::complex< float > *const  dpsiMrow[],
std::complex< float > *const  grads_now,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ calcGradients_batched() [5/5]

template sycl::event qmcplusplus::SYCL::calcGradients_batched ( sycl::queue aq,
const int  n,
const std::complex< double > *const  Ainvrow[],
const std::complex< double > *const  dpsiMrow[],
std::complex< double > *const  grads_now,
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ copyAinvRow_saveGL_batched() [1/5]

sycl::event copyAinvRow_saveGL_batched ( sycl::queue aq,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const int  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

Definition at line 21 of file matrix_update_helper.cpp.

References qmcplusplus::lda, and qmcplusplus::n.

Referenced by qmcplusplus::compute::copyAinvRow_saveGL_batched().

34 {
35  constexpr int COLBS = 128;
36 
37  return aq
38  .parallel_for(sycl::nd_range<1>{{static_cast<size_t>(batch_count * COLBS)}, {static_cast<size_t>(COLBS)}},
39  dependencies, [=](sycl::nd_item<1> item) {
40  const int iw = item.get_group(0); //blockIdx.x;
41  const T* __restrict__ Ainv_iw = Ainv[iw];
42  T* __restrict__ temp_iw = temp[iw];
43  T* __restrict__ rcopy_iw = rcopy[iw];
44  const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
45  T* __restrict__ dphi_out_iw = dphi_out[iw];
46  T* __restrict__ d2phi_out_iw = d2phi_out[iw];
47 
48  const int tid = item.get_local_id(0); //threadIdx.x;
49  if (tid == 0)
50  temp_iw[rowchanged] -= T(1); //temp_iw[rowchanged] = subtractOne<T>(temp_iw[rowchanged]);
51 
52  const int num_col_blocks = (n + COLBS - 1) / COLBS;
53  for (int ib = 0; ib < num_col_blocks; ib++)
54  {
55  const int col_id = ib * COLBS + tid; //threadIdx.x;
56  if (col_id < n)
57  {
58  rcopy_iw[col_id] = Ainv_iw[rowchanged * lda + col_id];
59 
60  // the following copying data on the device is not part of SM-1
61  // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel.
62  dphi_out_iw[col_id * 3] = phi_in_iw[col_id + phi_vgl_stride];
63  dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
64  dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
65  d2phi_out_iw[col_id] = phi_in_iw[col_id + phi_vgl_stride * 4];
66  }
67  }
68  });
69 }

◆ copyAinvRow_saveGL_batched() [2/5]

template sycl::event qmcplusplus::SYCL::copyAinvRow_saveGL_batched ( sycl::queue aq,
const int  rowchanged,
const int  n,
const float *const  Ainv[],
const int  lda,
float *const  temp[],
float *const  rcopy[],
const float *const  phi_vgl_in[],
const int  phi_vgl_stride,
float *const  dphi_out[],
float *const  d2phi_out[],
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ copyAinvRow_saveGL_batched() [3/5]

template sycl::event qmcplusplus::SYCL::copyAinvRow_saveGL_batched ( sycl::queue aq,
const int  rowchanged,
const int  n,
const double *const  Ainv[],
const int  lda,
double *const  temp[],
double *const  rcopy[],
const double *const  phi_vgl_in[],
const int  phi_vgl_stride,
double *const  dphi_out[],
double *const  d2phi_out[],
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ copyAinvRow_saveGL_batched() [4/5]

template sycl::event qmcplusplus::SYCL::copyAinvRow_saveGL_batched ( sycl::queue aq,
const int  rowchanged,
const int  n,
const std::complex< float > *const  Ainv[],
const int  lda,
std::complex< float > *const  temp[],
std::complex< float > *const  rcopy[],
const std::complex< float > *const  phi_vgl_in[],
const int  phi_vgl_stride,
std::complex< float > *const  dphi_out[],
std::complex< float > *const  d2phi_out[],
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)

◆ copyAinvRow_saveGL_batched() [5/5]

template sycl::event qmcplusplus::SYCL::copyAinvRow_saveGL_batched ( sycl::queue aq,
const int  rowchanged,
const int  n,
const std::complex< double > *const  Ainv[],
const int  lda,
std::complex< double > *const  temp[],
std::complex< double > *const  rcopy[],
const std::complex< double > *const  phi_vgl_in[],
const int  phi_vgl_stride,
std::complex< double > *const  dphi_out[],
std::complex< double > *const  d2phi_out[],
const int  batch_count,
const std::vector< sycl::event > &  dependencies 
)