QMCPACK
qmcplusplus::compute Namespace Reference

Namespaces

 BLAS
 

Classes

class  BLASHandle
 
class  BLASHandle< PlatformKind::CUDA >
 
class  BLASHandle< PlatformKind::OMPTARGET >
 
class  BLASHandle< PlatformKind::SYCL >
 
class  Queue
 
class  Queue< PlatformKind::CUDA >
 
class  Queue< PlatformKind::OMPTARGET >
 
class  Queue< PlatformKind::SYCL >
 

Functions

template<typename T >
void copyAinvRow_saveGL_batched (Queue< PlatformKind::CUDA > &queue, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count)
 
template<typename T >
void calcGradients_batched (Queue< PlatformKind::CUDA > &queue, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count)
 
template<typename T >
void add_delay_list_save_sigma_VGL_batched (Queue< PlatformKind::CUDA > &queue, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
 
template<typename T >
void applyW_batched (Queue< PlatformKind::CUDA > &queue, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count)
 
template<typename T >
void copyAinvRow_saveGL_batched (Queue< PlatformKind::OMPTARGET > &queue, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count)
 
template<typename T >
void calcGradients_batched (Queue< PlatformKind::OMPTARGET > &queue, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count)
 
template<typename T >
void add_delay_list_save_sigma_VGL_batched (Queue< PlatformKind::OMPTARGET > &queue, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
 
template<typename T >
void applyW_batched (Queue< PlatformKind::OMPTARGET > &queue, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count)
 
template<typename T >
void copyAinvRow_saveGL_batched (Queue< PlatformKind::SYCL > &queue, const int rowchanged, const int n, const T *const Ainv[], const int lda, T *const temp[], T *const rcopy[], const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const dphi_out[], T *const d2phi_out[], const int batch_count)
 
template<typename T >
void calcGradients_batched (Queue< PlatformKind::SYCL > &queue, const int n, const T *const Ainvrow[], const T *const dpsiMrow[], T *const grads_now, const int batch_count)
 
template<typename T >
void add_delay_list_save_sigma_VGL_batched (Queue< PlatformKind::SYCL > &queue, int *const delay_list[], const int rowchanged, const int delay_count, T *const binv[], const int binv_lda, const T *const ratio_inv, const T *const phi_vgl_in[], const size_t phi_vgl_stride, T *const phi_out[], T *const dphi_out[], T *const d2phi_out[], const int norb, const int n_accepted, const int batch_count)
 
template<typename T >
void applyW_batched (Queue< PlatformKind::SYCL > &queue, const int *const delay_list[], const int delay_count, T *const tempMat[], const int lda, const int batch_count)
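These overload sets differ only in the platform tag carried by the Queue argument, so generic callers such as DelayedUpdateBatched can be written once against a PlatformKind template parameter and let overload resolution pick the CUDA, OMPTARGET, or SYCL implementation. A minimal sketch of that dispatch pattern follows; the wrapper name is hypothetical, the code is assumed to live inside namespace qmcplusplus, and the appropriate AccelMatrixUpdate header for the chosen platform is assumed to be included.

// Hypothetical caller, templated on the platform tag, showing how the
// per-platform overloads listed above are selected by overload resolution.
template<PlatformKind PL, typename T>
void evalGrad_sketch(compute::Queue<PL>& queue,
                     const int n,
                     const T* const Ainvrow[],   // one row of A^-1 per walker (device pointers)
                     const T* const dpsiMrow[],  // interleaved gradient rows per walker (device pointers)
                     T* const grads_now,         // device buffer holding 3 * batch_count values
                     const int batch_count)
{
  // Resolves to the CUDA, OMPTARGET, or SYCL overload depending on PL.
  compute::calcGradients_batched(queue, n, Ainvrow, dpsiMrow, grads_now, batch_count);
}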
 

Class Documentation

◆ qmcplusplus::compute::BLASHandle

class qmcplusplus::compute::BLASHandle

template<PlatformKind PL>
class qmcplusplus::compute::BLASHandle< PL >

Definition at line 24 of file AccelBLASHandle.hpp.


◆ qmcplusplus::compute::Queue

class qmcplusplus::compute::Queue

template<PlatformKind PL>
class qmcplusplus::compute::Queue< PL >

Definition at line 24 of file Queue.hpp.

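Queue is a thin, platform-tagged wrapper around the native asynchronous execution resource of each backend (for example a CUDA stream for PlatformKind::CUDA). The only member relied upon by the kernels documented below is getNative(), which exposes that native handle. A minimal usage sketch follows; the default constructor and the sync() member used for host-side synchronization are illustrative assumptions, not confirmed by this page.

// Illustrative only: create a CUDA-tagged queue, hand it to one of the
// batched kernels below, then wait for completion on the host.
qmcplusplus::compute::Queue<PlatformKind::CUDA> queue;   // assumed default-constructible
qmcplusplus::compute::applyW_batched(queue, delay_list, delay_count, tempMat, lda, batch_count);
queue.sync();                                            // assumed host-side synchronization point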

Function Documentation

◆ add_delay_list_save_sigma_VGL_batched() [1/3]

void qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched ( Queue< PlatformKind::CUDA > &  queue,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count 
)

Definition at line 57 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::add_delay_list_save_sigma_VGL_batched(), qmcplusplus::cudaErrorCheck(), and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_accept_rejectRow().

{
  cudaErrorCheck(CUDA::add_delay_list_save_sigma_VGL_batched(queue.getNative(), delay_list, rowchanged, delay_count,
                                                             binv, binv_lda, ratio_inv, phi_vgl_in, phi_vgl_stride,
                                                             phi_out, dphi_out, d2phi_out, norb, n_accepted,
                                                             batch_count),
                 "CUDA::add_delay_list_save_y_VGL_batched failed!");
}

◆ add_delay_list_save_sigma_VGL_batched() [2/3]

void qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched ( Queue< PlatformKind::SYCL > &  queue,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count 
)

Definition at line 69 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::add_delay_list_save_sigma_VGL_batched() and qmcplusplus::queue.

{
  try
  {
    SYCL::add_delay_list_save_sigma_VGL_batched(queue.getNative(), delay_list, rowchanged, delay_count, binv, binv_lda,
                                                ratio_inv, phi_vgl_in, phi_vgl_stride, phi_out, dphi_out, d2phi_out,
                                                norb, n_accepted, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::add_delay_list_save_y_VGL_batched exception: ") + e.what());
  }
}

◆ add_delay_list_save_sigma_VGL_batched() [3/3]

void qmcplusplus::compute::add_delay_list_save_sigma_VGL_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
int *const  delay_list[],
const int  rowchanged,
const int  delay_count,
T *const  binv[],
const int  binv_lda,
const T *const  ratio_inv,
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  phi_out[],
T *const  dphi_out[],
T *const  d2phi_out[],
const int  norb,
const int  n_accepted,
const int  batch_count 
)

Definition at line 98 of file AccelMatrixUpdateOMPTarget.hpp.

{
  PRAGMA_OFFLOAD("omp target teams distribute \
                  is_device_ptr(delay_list, binv, ratio_inv, phi_vgl_in, phi_out, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
    if (iw < n_accepted)
    {
      // real accept, settle y and Z
      int* __restrict__ delay_list_iw = delay_list[iw];
      T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
      const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
      T* __restrict__ phi_out_iw      = phi_out[iw];
      T* __restrict__ dphi_out_iw     = dphi_out[iw];
      T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

      delay_list_iw[delay_count] = rowchanged;
      binvrow_iw[delay_count]    = ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binvrow_iw[col_id] *= ratio_inv[iw];

      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel.
        phi_out_iw[col_id]          = phi_in_iw[col_id];
        dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
        dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
        dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
        d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
      }
    }
    else
    {
      // fake accept. Set Y, Z with zero and x with 1
      T* __restrict__ binv_iw = binv[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < delay_count; col_id++)
        binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] = T(0);

      int* __restrict__ delay_list_iw = delay_list[iw];
      binv_iw[delay_count * binv_lda + delay_count] = T(1);
      delay_list_iw[delay_count]                    = -1;

      T* __restrict__ Urow_iw = phi_out[iw];
      PRAGMA_OFFLOAD("omp parallel for")
      for (size_t col_id = 0; col_id < norb; col_id++)
      {
        Urow_iw[col_id] = T(0);
      }
    }
}
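To summarize the two branches above: for an accepted walker the new row of binv is scaled by the walker's ratio_inv, the changed row index is recorded in delay_list, and the incoming value, gradient, and Laplacian data are scattered from phi_vgl_in into phi_out, dphi_out, and d2phi_out. For a rejected walker (a "fake accept" that keeps the batch in lockstep) the update is made a no-op: the new row and column of binv are zeroed, the diagonal entry is set to one, the delay_list entry is flagged with -1 (later skipped by applyW_batched), and the corresponding row of phi_out is cleared. In matrix terms, with k = delay_count, the fake-accept branch enforces

  binv[k][j] = binv[j][k] = delta_{jk}  for j = 0 .. k,    delay_list[k] = -1,    phi_out[0 .. norb-1] = 0.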

◆ applyW_batched() [1/3]

void qmcplusplus::compute::applyW_batched ( Queue< PlatformKind::CUDA > &  queue,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count 
)

Definition at line 82 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::applyW_batched(), qmcplusplus::cudaErrorCheck(), qmcplusplus::lda, and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_updateInvMat().

{
  cudaErrorCheck(CUDA::applyW_batched(queue.getNative(), delay_list, delay_count, tempMat, lda, batch_count),
                 "CUDA::applyW_batched failed!");
}

◆ applyW_batched() [2/3]

void qmcplusplus::compute::applyW_batched ( Queue< PlatformKind::SYCL > &  queue,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count 
)

Definition at line 99 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::applyW_batched(), qmcplusplus::lda, and qmcplusplus::queue.

{
  try
  {
    SYCL::applyW_batched(queue.getNative(), delay_list, delay_count, tempMat, lda, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::applyW_batched exception: ") + e.what());
  }
}

◆ applyW_batched() [3/3]

void qmcplusplus::compute::applyW_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
const int *const  delay_list[],
const int  delay_count,
T *const  tempMat[],
const int  lda,
const int  batch_count 
)

Definition at line 168 of file AccelMatrixUpdateOMPTarget.hpp.

References qmcplusplus::lda.

{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(delay_list, tempMat)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const int* __restrict__ delay_list_iw = delay_list[iw];
    T* __restrict__ tempMat_iw            = tempMat[iw];

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < delay_count; col_id++)
    {
      const int row_id = delay_list_iw[col_id];
      if (row_id >= 0)
        tempMat_iw[row_id * lda + col_id] = tempMat_iw[row_id * lda + col_id] - T(1);
    }
  }
}
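The body above makes the effect of applyW_batched explicit: for each walker and each column col_id whose delayed row index row_id = delay_list[col_id] is non-negative, one is subtracted from tempMat at (row_id, col_id), i.e.

  tempMat <- tempMat - W,   with W[i][j] = delta_{i, delay_list[j]},

while columns flagged with -1 by a fake accept are left unchanged.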

◆ calcGradients_batched() [1/3]

void qmcplusplus::compute::calcGradients_batched ( Queue< PlatformKind::CUDA > &  queue,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count 
)

Definition at line 45 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::calcGradients_batched(), qmcplusplus::cudaErrorCheck(), qmcplusplus::n, and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_evalGrad().

{
  cudaErrorCheck(CUDA::calcGradients_batched(queue.getNative(), n, Ainvrow, dpsiMrow, grads_now, batch_count),
                 "CUDA::calcGradients_cuda failed!");
}

◆ calcGradients_batched() [2/3]

void qmcplusplus::compute::calcGradients_batched ( Queue< PlatformKind::SYCL > &  queue,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count 
)

Definition at line 51 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::calcGradients_batched(), qmcplusplus::n, and qmcplusplus::queue.

{
  try
  {
    SYCL::calcGradients_batched(queue.getNative(), n, Ainvrow, dpsiMrow, grads_now, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::calcGradients_batched exception: ") + e.what());
  }
}

◆ calcGradients_batched() [3/3]

void qmcplusplus::compute::calcGradients_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
const int  n,
const T *const  Ainvrow[],
const T *const  dpsiMrow[],
T *const  grads_now,
const int  batch_count 
)

Definition at line 66 of file AccelMatrixUpdateOMPTarget.hpp.

References qmcplusplus::n.

{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainvrow, dpsiMrow, grads_now)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ invRow    = Ainvrow[iw];
    const T* __restrict__ dpsiM_row = dpsiMrow[iw];

    T sum_x = 0;
    T sum_y = 0;
    T sum_z = 0;

    PRAGMA_OFFLOAD("omp parallel for reduction(+: sum_x,sum_y,sum_z)")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      sum_x += invRow[col_id] * dpsiM_row[col_id * 3];
      sum_y += invRow[col_id] * dpsiM_row[col_id * 3 + 1];
      sum_z += invRow[col_id] * dpsiM_row[col_id * 3 + 2];
    }

    grads_now[iw * 3]     = sum_x;
    grads_now[iw * 3 + 1] = sum_y;
    grads_now[iw * 3 + 2] = sum_z;
  }
}
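Equivalently, each walker's gradient is the dot product of its A^{-1} row with the three gradient components stored interleaved (stride 3) in dpsiMrow, taking those components as the x, y, z orbital derivatives:

  grads_now[iw] = ( sum_j invRow[j] * dx_phi_j ,  sum_j invRow[j] * dy_phi_j ,  sum_j invRow[j] * dz_phi_j ).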

◆ copyAinvRow_saveGL_batched() [1/3]

void qmcplusplus::compute::copyAinvRow_saveGL_batched ( Queue< PlatformKind::OMPTARGET > &  queue,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count 
)

Definition at line 25 of file AccelMatrixUpdateOMPTarget.hpp.

References qmcplusplus::lda, and qmcplusplus::n.

{
  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Ainv, temp, rcopy, phi_vgl_in, dphi_out, d2phi_out)")
  for (size_t iw = 0; iw < batch_count; iw++)
  {
    const T* __restrict__ Ainv_iw   = Ainv[iw];
    T* __restrict__ temp_iw         = temp[iw];
    T* __restrict__ rcopy_iw        = rcopy[iw];
    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
    T* __restrict__ dphi_out_iw     = dphi_out[iw];
    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];

    temp_iw[rowchanged] = temp_iw[rowchanged] - T(1);

    PRAGMA_OFFLOAD("omp parallel for")
    for (size_t col_id = 0; col_id < n; col_id++)
    {
      rcopy_iw[col_id] = Ainv_iw[rowchanged * lda + col_id];

      // the following copying data on the device is not part of SM-1
      // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel.
      dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
      dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
      dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
      d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
    }
  }
}
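The strided offsets used above imply the following per-walker layout of phi_vgl_in, consistent with the VGL naming; the value block at offset 0 is consumed elsewhere (e.g. by add_delay_list_save_sigma_VGL_batched). The component ordering is inferred from the indexing and shown only as an assumed sketch.

// Assumed layout of one walker's phi_vgl_in buffer, with stride = phi_vgl_stride:
// [ 0          , 1 * stride )  -> orbital values         (untouched by this kernel)
// [ 1 * stride , 2 * stride )  -> x gradient components  -> dphi_out[col * 3]
// [ 2 * stride , 3 * stride )  -> y gradient components  -> dphi_out[col * 3 + 1]
// [ 3 * stride , 4 * stride )  -> z gradient components  -> dphi_out[col * 3 + 2]
// [ 4 * stride , 5 * stride )  -> Laplacians              -> d2phi_out[col]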

◆ copyAinvRow_saveGL_batched() [2/3]

void qmcplusplus::compute::copyAinvRow_saveGL_batched ( Queue< PlatformKind::CUDA > &  queue,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count 
)

Definition at line 26 of file AccelMatrixUpdateCUDA.hpp.

References qmcplusplus::CUDA::copyAinvRow_saveGL_batched(), qmcplusplus::cudaErrorCheck(), qmcplusplus::lda, qmcplusplus::n, and qmcplusplus::queue.

Referenced by DelayedUpdateBatched< PL, VALUE >::mw_updateRow().

{
  cudaErrorCheck(CUDA::copyAinvRow_saveGL_batched(queue.getNative(), rowchanged, n, Ainv, lda, temp, rcopy, phi_vgl_in,
                                                  phi_vgl_stride, dphi_out, d2phi_out, batch_count),
                 "CUDA::copyAinvRow_saveGL_cuda failed!");
}

◆ copyAinvRow_saveGL_batched() [3/3]

void qmcplusplus::compute::copyAinvRow_saveGL_batched ( Queue< PlatformKind::SYCL > &  queue,
const int  rowchanged,
const int  n,
const T *const  Ainv[],
const int  lda,
T *const  temp[],
T *const  rcopy[],
const T *const  phi_vgl_in[],
const size_t  phi_vgl_stride,
T *const  dphi_out[],
T *const  d2phi_out[],
const int  batch_count 
)

Definition at line 26 of file AccelMatrixUpdateSYCL.hpp.

References qmcplusplus::SYCL::copyAinvRow_saveGL_batched(), qmcplusplus::lda, qmcplusplus::n, and qmcplusplus::queue.

{
  try
  {
    SYCL::copyAinvRow_saveGL_batched(queue.getNative(), rowchanged, n, Ainv, lda, temp, rcopy, phi_vgl_in,
                                     phi_vgl_stride, dphi_out, d2phi_out, batch_count);
  }
  catch (sycl::exception& e)
  {
    throw std::runtime_error(std::string("SYCL::copyAinvRow_saveGL_batched exception: ") + e.what());
  }
}