![]() |
QMCPACK
|
Implement selected batched BLAS1/2 calls using CUDA for different data types S/C/D/Z. More...
Functions | |
cudaError_t | gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const float *alpha, const float *const A[], const int lda, const float *const x[], const int incx, const float *beta, float *const y[], const int incy, const int batch_count) |
Xgemv batched API. More... | |
cudaError_t | gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const double *alpha, const double *const A[], const int lda, const double *const x[], const int incx, const double *beta, double *const y[], const int incy, const int batch_count) |
cudaError_t | gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const std::complex< float > *alpha, const std::complex< float > *const A[], const int lda, const std::complex< float > *const x[], const int incx, const std::complex< float > *beta, std::complex< float > *const y[], const int incy, const int batch_count) |
cudaError_t | gemv_batched (cudaStream_t handle, const char trans, const int m, const int n, const std::complex< double > *alpha, const std::complex< double > *const A[], const int lda, const std::complex< double > *const x[], const int incx, const std::complex< double > *beta, std::complex< double > *const y[], const int incy, const int batch_count) |
cudaError_t | ger_batched (cudaStream_t handle, const int m, const int n, const float *alpha, const float *const x[], const int incx, const float *const y[], const int incy, float *const A[], const int lda, const int batch_count) |
Xger batched API. More... | |
cudaError_t | ger_batched (cudaStream_t handle, const int m, const int n, const double *alpha, const double *const x[], const int incx, const double *const y[], const int incy, double *const A[], const int lda, const int batch_count) |
cudaError_t | ger_batched (cudaStream_t handle, const int m, const int n, const std::complex< float > *alpha, const std::complex< float > *const x[], const int incx, const std::complex< float > *const y[], const int incy, std::complex< float > *const A[], const int lda, const int batch_count) |
cudaError_t | ger_batched (cudaStream_t handle, const int m, const int n, const std::complex< double > *alpha, const std::complex< double > *const x[], const int incx, const std::complex< double > *const y[], const int incy, std::complex< double > *const A[], const int lda, const int batch_count) |
cudaError_t | copy_batched (cudaStream_t hstream, const int n, const float *const in[], const int incx, float *const out[], const int incy, const int batch_count) |
Xcopy batched API. More... | |
cudaError_t | copy_batched (cudaStream_t hstream, const int n, const double *const in[], const int incx, double *const out[], const int incy, const int batch_count) |
cudaError_t | copy_batched (cudaStream_t hstream, const int n, const std::complex< float > *const in[], const int incx, std::complex< float > *const out[], const int incy, const int batch_count) |
cudaError_t | copy_batched (cudaStream_t hstream, const int n, const std::complex< double > *const in[], const int incx, std::complex< double > *const out[], const int incy, const int batch_count) |
Implement selected batched BLAS1/2 calls using CUDA for different data types S/C/D/Z.
cuBLAS_MFs stands for missing functions in cuBLAS. 1) column major just like the BLAS fortran API 2) all the functions are asynchronous 3) all the pointer arguments are expected as device pointers. 4) in batched APIs, alpha and beta are not scalars but pointers to array of batch size.
cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched | ( | cudaStream_t | hstream, |
const int | n, | ||
const float *const | in[], | ||
const int | incx, | ||
float *const | out[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
Xcopy batched API.
handle | handle for asynchronous computation |
n | number of elements to be copied |
in | device array of device pointers of vector |
incx | increment for the elements of in. It cannot be zero. |
out | device array of device pointers of vector |
incy | increment for the elements of out. It cannot be zero. |
batch_count | batch size |
Referenced by qmcplusplus::compute::BLAS::copy_batched().
cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched | ( | cudaStream_t | hstream, |
const int | n, | ||
const double *const | in[], | ||
const int | incx, | ||
double *const | out[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched | ( | cudaStream_t | hstream, |
const int | n, | ||
const std::complex< float > *const | in[], | ||
const int | incx, | ||
std::complex< float > *const | out[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::copy_batched | ( | cudaStream_t | hstream, |
const int | n, | ||
const std::complex< double > *const | in[], | ||
const int | incx, | ||
std::complex< double > *const | out[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched | ( | cudaStream_t | handle, |
const char | trans, | ||
const int | m, | ||
const int | n, | ||
const float * | alpha, | ||
const float *const | A[], | ||
const int | lda, | ||
const float *const | x[], | ||
const int | incx, | ||
const float * | beta, | ||
float *const | y[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
Xgemv batched API.
handle | handle for asynchronous computation |
trans | whether A matrices are transposed |
m | number of rows in A |
n | number of columns in A |
alpha | the factor vector of A |
A | device array of device pointers of matrices |
lda | leading dimension of A |
x | device array of device pointers of vector |
incx | increment for the elements of x. It cannot be zero. |
beta | the factor vector of vector y |
y | device array of device pointers of vector |
incy | increment for the elements of y. It cannot be zero. |
batch_count | batch size |
Referenced by qmcplusplus::compute::BLAS::gemv_batched().
cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched | ( | cudaStream_t | handle, |
const char | trans, | ||
const int | m, | ||
const int | n, | ||
const double * | alpha, | ||
const double *const | A[], | ||
const int | lda, | ||
const double *const | x[], | ||
const int | incx, | ||
const double * | beta, | ||
double *const | y[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched | ( | cudaStream_t | handle, |
const char | trans, | ||
const int | m, | ||
const int | n, | ||
const std::complex< float > * | alpha, | ||
const std::complex< float > *const | A[], | ||
const int | lda, | ||
const std::complex< float > *const | x[], | ||
const int | incx, | ||
const std::complex< float > * | beta, | ||
std::complex< float > *const | y[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::gemv_batched | ( | cudaStream_t | handle, |
const char | trans, | ||
const int | m, | ||
const int | n, | ||
const std::complex< double > * | alpha, | ||
const std::complex< double > *const | A[], | ||
const int | lda, | ||
const std::complex< double > *const | x[], | ||
const int | incx, | ||
const std::complex< double > * | beta, | ||
std::complex< double > *const | y[], | ||
const int | incy, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched | ( | cudaStream_t | handle, |
const int | m, | ||
const int | n, | ||
const float * | alpha, | ||
const float *const | x[], | ||
const int | incx, | ||
const float *const | y[], | ||
const int | incy, | ||
float *const | A[], | ||
const int | lda, | ||
const int | batch_count | ||
) |
Xger batched API.
handle | handle for asynchronous computation |
m | number of rows in A |
n | number of columns in A |
alpha | the factor vector of A |
x | device array of device pointers of vector |
incx | increment for the elements of x. It cannot be zero. |
y | device array of device pointers of vector |
incy | increment for the elements of y. It cannot be zero. |
A | device array of device pointers of matrices |
lda | leading dimension of A |
batch_count | batch size |
Referenced by qmcplusplus::compute::BLAS::ger_batched().
cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched | ( | cudaStream_t | handle, |
const int | m, | ||
const int | n, | ||
const double * | alpha, | ||
const double *const | x[], | ||
const int | incx, | ||
const double *const | y[], | ||
const int | incy, | ||
double *const | A[], | ||
const int | lda, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched | ( | cudaStream_t | handle, |
const int | m, | ||
const int | n, | ||
const std::complex< float > * | alpha, | ||
const std::complex< float > *const | x[], | ||
const int | incx, | ||
const std::complex< float > *const | y[], | ||
const int | incy, | ||
std::complex< float > *const | A[], | ||
const int | lda, | ||
const int | batch_count | ||
) |
cudaError_t qmcplusplus::cuBLAS_MFs::ger_batched | ( | cudaStream_t | handle, |
const int | m, | ||
const int | n, | ||
const std::complex< double > * | alpha, | ||
const std::complex< double > *const | x[], | ||
const int | incx, | ||
const std::complex< double > *const | y[], | ||
const int | incy, | ||
std::complex< double > *const | A[], | ||
const int | lda, | ||
const int | batch_count | ||
) |