Implement selected batched BLAS1/2 calls using CUDA for different data types S/C/D/Z. More...

Functions
cudaError_t	gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const float alpha, const float const A[], const int lda, const float const x[], const int incx, const float beta, float *const y[], const int incy, const int batch_count)
	Xgemv batched API. More...

cudaError_t	gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const double alpha, const double const A[], const int lda, const double const x[], const int incx, const double beta, double *const y[], const int incy, const int batch_count)

cudaError_t	gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const std::complex< float > alpha, const std::complex< float > const A[], const int lda, const std::complex< float > const x[], const int incx, const std::complex< float > beta, std::complex< float > *const y[], const int incy, const int batch_count)

cudaError_t	gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const std::complex< double > alpha, const std::complex< double > const A[], const int lda, const std::complex< double > const x[], const int incx, const std::complex< double > beta, std::complex< double > *const y[], const int incy, const int batch_count)

cudaError_t	ger_batched (cudaStream_t handle, const int m, const int n, const float alpha, const float const x[], const int incx, const float const y[], const int incy, float const A[], const int lda, const int batch_count)
	Xger batched API. More...

cudaError_t	ger_batched (cudaStream_t handle, const int m, const int n, const double alpha, const double const x[], const int incx, const double const y[], const int incy, double const A[], const int lda, const int batch_count)

cudaError_t	ger_batched (cudaStream_t handle, const int m, const int n, const std::complex< float > alpha, const std::complex< float > const x[], const int incx, const std::complex< float > const y[], const int incy, std::complex< float > const A[], const int lda, const int batch_count)

cudaError_t	ger_batched (cudaStream_t handle, const int m, const int n, const std::complex< double > alpha, const std::complex< double > const x[], const int incx, const std::complex< double > const y[], const int incy, std::complex< double > const A[], const int lda, const int batch_count)

cudaError_t	copy_batched (cudaStream_t hstream, const int n, const float const in[], const int incx, float const out[], const int incy, const int batch_count)
	Xcopy batched API. More...

cudaError_t	copy_batched (cudaStream_t hstream, const int n, const double const in[], const int incx, double const out[], const int incy, const int batch_count)

cudaError_t	copy_batched (cudaStream_t hstream, const int n, const std::complex< float > const in[], const int incx, std::complex< float > const out[], const int incy, const int batch_count)

cudaError_t	copy_batched (cudaStream_t hstream, const int n, const std::complex< double > const in[], const int incx, std::complex< double > const out[], const int incy, const int batch_count)

Detailed Description

Implement selected batched BLAS1/2 calls using CUDA for different data types S/C/D/Z.

cuBLAS_MFs stands for missing functions in cuBLAS. 1) column major just like the BLAS fortran API 2) all the functions are asynchronous 3) all the pointer arguments are expected as device pointers. 4) in batched APIs, alpha and beta are not scalars but pointers to array of batch size.

Function Documentation

◆ copy_batched() [1/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched	(	cudaStream_t	hstream,
		const int	n,
		const float *const	in[],
		const int	incx,
		float *const	out[],
		const int	incy,
		const int	batch_count
	)

Xcopy batched API.

Parameters

handle	handle for asynchronous computation
n	number of elements to be copied
in	device array of device pointers of vector
incx	increment for the elements of in. It cannot be zero.
out	device array of device pointers of vector
incy	increment for the elements of out. It cannot be zero.
batch_count	batch size

Referenced by qmcplusplus::compute::BLAS::copy_batched().

◆ copy_batched() [2/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched	(	cudaStream_t	hstream,
		const int	n,
		const double *const	in[],
		const int	incx,
		double *const	out[],
		const int	incy,
		const int	batch_count
	)

◆ copy_batched() [3/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched	(	cudaStream_t	hstream,
		const int	n,
		const std::complex< float > *const	in[],
		const int	incx,
		std::complex< float > *const	out[],
		const int	incy,
		const int	batch_count
	)

◆ copy_batched() [4/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched	(	cudaStream_t	hstream,
		const int	n,
		const std::complex< double > *const	in[],
		const int	incx,
		std::complex< double > *const	out[],
		const int	incy,
		const int	batch_count
	)

◆ gemv_batched() [1/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched	(	cudaStream_t	handle,
		const char	trans,
		const int	m,
		const int	n,
		const float *	alpha,
		const float *const	A[],
		const int	lda,
		const float *const	x[],
		const int	incx,
		const float *	beta,
		float *const	y[],
		const int	incy,
		const int	batch_count
	)

Xgemv batched API.

Parameters

handle	handle for asynchronous computation
trans	whether A matrices are transposed
m	number of rows in A
n	number of columns in A
alpha	the factor vector of A
A	device array of device pointers of matrices
lda	leading dimension of A
x	device array of device pointers of vector
incx	increment for the elements of x. It cannot be zero.
beta	the factor vector of vector y
y	device array of device pointers of vector
incy	increment for the elements of y. It cannot be zero.
batch_count	batch size

Referenced by qmcplusplus::compute::BLAS::gemv_batched().

◆ gemv_batched() [2/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched	(	cudaStream_t	handle,
		const char	trans,
		const int	m,
		const int	n,
		const double *	alpha,
		const double *const	A[],
		const int	lda,
		const double *const	x[],
		const int	incx,
		const double *	beta,
		double *const	y[],
		const int	incy,
		const int	batch_count
	)

◆ gemv_batched() [3/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched	(	cudaStream_t	handle,
		const char	trans,
		const int	m,
		const int	n,
		const std::complex< float > *	alpha,
		const std::complex< float > *const	A[],
		const int	lda,
		const std::complex< float > *const	x[],
		const int	incx,
		const std::complex< float > *	beta,
		std::complex< float > *const	y[],
		const int	incy,
		const int	batch_count
	)

◆ gemv_batched() [4/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched	(	cudaStream_t	handle,
		const char	trans,
		const int	m,
		const int	n,
		const std::complex< double > *	alpha,
		const std::complex< double > *const	A[],
		const int	lda,
		const std::complex< double > *const	x[],
		const int	incx,
		const std::complex< double > *	beta,
		std::complex< double > *const	y[],
		const int	incy,
		const int	batch_count
	)

◆ ger_batched() [1/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched	(	cudaStream_t	handle,
		const int	m,
		const int	n,
		const float *	alpha,
		const float *const	x[],
		const int	incx,
		const float *const	y[],
		const int	incy,
		float *const	A[],
		const int	lda,
		const int	batch_count
	)

Xger batched API.

Parameters

handle	handle for asynchronous computation
m	number of rows in A
n	number of columns in A
alpha	the factor vector of A
x	device array of device pointers of vector
incx	increment for the elements of x. It cannot be zero.
y	device array of device pointers of vector
incy	increment for the elements of y. It cannot be zero.
A	device array of device pointers of matrices
lda	leading dimension of A
batch_count	batch size

Referenced by qmcplusplus::compute::BLAS::ger_batched().

◆ ger_batched() [2/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched	(	cudaStream_t	handle,
		const int	m,
		const int	n,
		const double *	alpha,
		const double *const	x[],
		const int	incx,
		const double *const	y[],
		const int	incy,
		double *const	A[],
		const int	lda,
		const int	batch_count
	)

◆ ger_batched() [3/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched	(	cudaStream_t	handle,
		const int	m,
		const int	n,
		const std::complex< float > *	alpha,
		const std::complex< float > *const	x[],
		const int	incx,
		const std::complex< float > *const	y[],
		const int	incy,
		std::complex< float > *const	A[],
		const int	lda,
		const int	batch_count
	)

◆ ger_batched() [4/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched	(	cudaStream_t	handle,
		const int	m,
		const int	n,
		const std::complex< double > *	alpha,
		const std::complex< double > *const	x[],
		const int	incx,
		const std::complex< double > *const	y[],
		const int	incy,
		std::complex< double > *const	A[],
		const int	lda,
		const int	batch_count
	)

Functions

Detailed Description

Function Documentation

◆ copy_batched() [1/4]

◆ copy_batched() [2/4]

◆ copy_batched() [3/4]

◆ copy_batched() [4/4]

◆ gemv_batched() [1/4]

◆ gemv_batched() [2/4]

◆ gemv_batched() [3/4]

◆ gemv_batched() [4/4]

◆ ger_batched() [1/4]

◆ ger_batched() [2/4]

◆ ger_batched() [3/4]

◆ ger_batched() [4/4]