QMCPACK
qmcplusplus::cuBLAS_MFs Namespace Reference

Implement selected batched BLAS1/2 calls using CUDA for different data types S/C/D/Z. More...

Functions

cudaError_t gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const float *alpha, const float *const A[], const int lda, const float *const x[], const int incx, const float *beta, float *const y[], const int incy, const int batch_count)
 Xgemv batched API. More...
 
cudaError_t gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const double *alpha, const double *const A[], const int lda, const double *const x[], const int incx, const double *beta, double *const y[], const int incy, const int batch_count)
 
cudaError_t gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const std::complex< float > *alpha, const std::complex< float > *const A[], const int lda, const std::complex< float > *const x[], const int incx, const std::complex< float > *beta, std::complex< float > *const y[], const int incy, const int batch_count)
 
cudaError_t gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const std::complex< double > *alpha, const std::complex< double > *const A[], const int lda, const std::complex< double > *const x[], const int incx, const std::complex< double > *beta, std::complex< double > *const y[], const int incy, const int batch_count)
 
cudaError_t ger_batched (cudaStream_t handle, const int m, const int n, const float *alpha, const float *const x[], const int incx, const float *const y[], const int incy, float *const A[], const int lda, const int batch_count)
 Xger batched API. More...
 
cudaError_t ger_batched (cudaStream_t handle, const int m, const int n, const double *alpha, const double *const x[], const int incx, const double *const y[], const int incy, double *const A[], const int lda, const int batch_count)
 
cudaError_t ger_batched (cudaStream_t handle, const int m, const int n, const std::complex< float > *alpha, const std::complex< float > *const x[], const int incx, const std::complex< float > *const y[], const int incy, std::complex< float > *const A[], const int lda, const int batch_count)
 
cudaError_t ger_batched (cudaStream_t handle, const int m, const int n, const std::complex< double > *alpha, const std::complex< double > *const x[], const int incx, const std::complex< double > *const y[], const int incy, std::complex< double > *const A[], const int lda, const int batch_count)
 
cudaError_t copy_batched (cudaStream_t hstream, const int n, const float *const in[], const int incx, float *const out[], const int incy, const int batch_count)
 Xcopy batched API. More...
 
cudaError_t copy_batched (cudaStream_t hstream, const int n, const double *const in[], const int incx, double *const out[], const int incy, const int batch_count)
 
cudaError_t copy_batched (cudaStream_t hstream, const int n, const std::complex< float > *const in[], const int incx, std::complex< float > *const out[], const int incy, const int batch_count)
 
cudaError_t copy_batched (cudaStream_t hstream, const int n, const std::complex< double > *const in[], const int incx, std::complex< double > *const out[], const int incy, const int batch_count)
 

Detailed Description

Implement selected batched BLAS1/2 calls using CUDA for different data types S/C/D/Z.

cuBLAS_MFs stands for "missing functions" in cuBLAS. The functions in this namespace follow these conventions: 1) matrices are column-major, just like the BLAS Fortran API; 2) all the functions are asynchronous; 3) all pointer arguments are expected to be device pointers; 4) in the batched APIs, alpha and beta are not scalars but pointers to arrays whose length is the batch size.

Function Documentation

◆ copy_batched() [1/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched ( cudaStream_t  hstream,
const int  n,
const float *const  in[],
const int  incx,
float *const  out[],
const int  incy,
const int  batch_count 
)

Xcopy batched API.

Parameters
hstream: stream handle for asynchronous computation
n: number of elements to be copied
in: device array of device pointers to the input vectors
incx: increment for the elements of in. It cannot be zero.
out: device array of device pointers to the output vectors
incy: increment for the elements of out. It cannot be zero.
batch_count: batch size

Referenced by qmcplusplus::compute::BLAS::copy_batched().

◆ copy_batched() [2/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched ( cudaStream_t  hstream,
const int  n,
const double *const  in[],
const int  incx,
double *const  out[],
const int  incy,
const int  batch_count 
)

◆ copy_batched() [3/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched ( cudaStream_t  hstream,
const int  n,
const std::complex< float > *const  in[],
const int  incx,
std::complex< float > *const  out[],
const int  incy,
const int  batch_count 
)

◆ copy_batched() [4/4]

cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched ( cudaStream_t  hstream,
const int  n,
const std::complex< double > *const  in[],
const int  incx,
std::complex< double > *const  out[],
const int  incy,
const int  batch_count 
)

◆ gemv_batched() [1/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched ( cudaStream_t  handle,
const char  trans,
const int  m,
const int  n,
const float *  alpha,
const float *const  A[],
const int  lda,
const float *const  x[],
const int  incx,
const float *  beta,
float *const  y[],
const int  incy,
const int  batch_count 
)

Xgemv batched API.

Parameters
handle: handle for asynchronous computation
trans: whether the A matrices are transposed
m: number of rows in A
n: number of columns in A
alpha: vector of scaling factors applied to A (one per batch entry)
A: device array of device pointers to the matrices
lda: leading dimension of A
x: device array of device pointers to the x vectors
incx: increment for the elements of x. It cannot be zero.
beta: vector of scaling factors applied to the vector y (one per batch entry)
y: device array of device pointers to the y vectors
incy: increment for the elements of y. It cannot be zero.
batch_count: batch size

Referenced by qmcplusplus::compute::BLAS::gemv_batched().

◆ gemv_batched() [2/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched ( cudaStream_t  handle,
const char  trans,
const int  m,
const int  n,
const double *  alpha,
const double *const  A[],
const int  lda,
const double *const  x[],
const int  incx,
const double *  beta,
double *const  y[],
const int  incy,
const int  batch_count 
)

◆ gemv_batched() [3/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched ( cudaStream_t  handle,
const char  trans,
const int  m,
const int  n,
const std::complex< float > *  alpha,
const std::complex< float > *const  A[],
const int  lda,
const std::complex< float > *const  x[],
const int  incx,
const std::complex< float > *  beta,
std::complex< float > *const  y[],
const int  incy,
const int  batch_count 
)

◆ gemv_batched() [4/4]

cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched ( cudaStream_t  handle,
const char  trans,
const int  m,
const int  n,
const std::complex< double > *  alpha,
const std::complex< double > *const  A[],
const int  lda,
const std::complex< double > *const  x[],
const int  incx,
const std::complex< double > *  beta,
std::complex< double > *const  y[],
const int  incy,
const int  batch_count 
)

◆ ger_batched() [1/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched ( cudaStream_t  handle,
const int  m,
const int  n,
const float *  alpha,
const float *const  x[],
const int  incx,
const float *const  y[],
const int  incy,
float *const  A[],
const int  lda,
const int  batch_count 
)

Xger batched API.

Parameters
handle: handle for asynchronous computation
m: number of rows in A
n: number of columns in A
alpha: vector of scaling factors (one per batch entry)
x: device array of device pointers to the x vectors
incx: increment for the elements of x. It cannot be zero.
y: device array of device pointers to the y vectors
incy: increment for the elements of y. It cannot be zero.
A: device array of device pointers to the matrices
lda: leading dimension of A
batch_count: batch size

Referenced by qmcplusplus::compute::BLAS::ger_batched().

◆ ger_batched() [2/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched ( cudaStream_t  handle,
const int  m,
const int  n,
const double *  alpha,
const double *const  x[],
const int  incx,
const double *const  y[],
const int  incy,
double *const  A[],
const int  lda,
const int  batch_count 
)

◆ ger_batched() [3/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched ( cudaStream_t  handle,
const int  m,
const int  n,
const std::complex< float > *  alpha,
const std::complex< float > *const  x[],
const int  incx,
const std::complex< float > *const  y[],
const int  incy,
std::complex< float > *const  A[],
const int  lda,
const int  batch_count 
)

◆ ger_batched() [4/4]

cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched ( cudaStream_t  handle,
const int  m,
const int  n,
const std::complex< double > *  alpha,
const std::complex< double > *const  x[],
const int  incx,
const std::complex< double > *const  y[],
const int  incy,
std::complex< double > *const  A[],
const int  lda,
const int  batch_count 
)