QMCPACK
qmcplusplus::syclBLAS Namespace Reference

Typedefs

using syclBLAS_int = std::int64_t
 
using syclBLAS_status = sycl::event
 
using syclBLAS_handle = sycl::queue
 

Functions

template<typename T >
sycl::event gemv (sycl::queue &handle, const char trans, const int m, const int n, const T alpha, const T *const A, const int lda, const T *const x, const int incx, const T beta, T *const y, const int incy, const std::vector< sycl::event > &events)
 
template sycl::event gemv (sycl::queue &handle, const char trans, const int m, const int n, const double alpha, const double *const A, const int lda, const double *const x, const int incx, const double beta, double *const y, const int incy, const std::vector< sycl::event > &events)
 
template sycl::event gemv (sycl::queue &handle, const char trans, const int m, const int n, const float alpha, const float *const A, const int lda, const float *const x, const int incx, const float beta, float *const y, const int incy, const std::vector< sycl::event > &events)
 
template sycl::event gemv (sycl::queue &handle, const char trans, const int m, const int n, const std::complex< double > alpha, const std::complex< double > *const A, const int lda, const std::complex< double > *const x, const int incx, const std::complex< double > beta, std::complex< double > *const y, const int incy, const std::vector< sycl::event > &events)
 
template sycl::event gemv (sycl::queue &handle, const char trans, const int m, const int n, const std::complex< float > alpha, const std::complex< float > *const A, const int lda, const std::complex< float > *const x, const int incx, const std::complex< float > beta, std::complex< float > *const y, const int incy, const std::vector< sycl::event > &events)
 
template<typename T , unsigned COLBS>
sycl::event gemvT_batched_impl (sycl::queue &handle, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events={})
 gemv trans = 'T' case. More...
 
template<typename T , unsigned ROWBS>
sycl::event gemvN_batched_impl (sycl::queue &handle, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events={})
 gemv trans = 'N' case. More...
 
template<>
sycl::event gemv_batched< float > (sycl::queue &handle, const char trans, const int m, const int n, const float *alpha, const float *const A[], const int lda, const float *const x[], const int incx, const float *beta, float *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event gemv_batched< double > (sycl::queue &handle, const char trans, const int m, const int n, const double *alpha, const double *const A[], const int lda, const double *const x[], const int incx, const double *beta, double *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event gemv_batched< std::complex< float > > (sycl::queue &handle, const char trans, const int m, const int n, const std::complex< float > *alpha, const std::complex< float > *const A[], const int lda, const std::complex< float > *const x[], const int incx, const std::complex< float > *beta, std::complex< float > *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event gemv_batched< std::complex< double > > (sycl::queue &handle, const char trans, const int m, const int n, const std::complex< double > *alpha, const std::complex< double > *const A[], const int lda, const std::complex< double > *const x[], const int incx, const std::complex< double > *beta, std::complex< double > *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<typename T >
sycl::event gemm (sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const T alpha, const T *A, const int lda, const T *B, const int ldb, const T beta, T *C, const int ldc, const std::vector< sycl::event > &events)
 
template sycl::event gemm (sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const float alpha, const float *const A, const int lda, const float *const B, const int ldb, const float beta, float *const C, const int ldc, const std::vector< sycl::event > &events)
 
template sycl::event gemm (sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const double alpha, const double *const A, const int lda, const double *const B, const int ldb, const double beta, double *const C, const int ldc, const std::vector< sycl::event > &events)
 
template sycl::event gemm (sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const std::complex< float > alpha, const std::complex< float > *const A, const int lda, const std::complex< float > *const B, const int ldb, const std::complex< float > beta, std::complex< float > *const C, const int ldc, const std::vector< sycl::event > &events)
 
template sycl::event gemm (sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const std::complex< double > alpha, const std::complex< double > *const A, const int lda, const std::complex< double > *const B, const int ldb, const std::complex< double > beta, std::complex< double > *const C, const int ldc, const std::vector< sycl::event > &events)
 
template<typename T , int TILE_SIZE, int ROWBS>
sycl::event ger_batched_impl (sycl::queue &handle, const int m, const int n, const T *alpha, const T *const x[], const int incx, const T *const y[], const int incy, T *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event ger_batched< float > (sycl::queue &handle, const int m, const int n, const float *alpha, const float *const x[], const int incx, const float *const y[], const int incy, float *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event ger_batched< double > (sycl::queue &handle, const int m, const int n, const double *alpha, const double *const x[], const int incx, const double *const y[], const int incy, double *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event ger_batched< std::complex< float > > (sycl::queue &handle, const int m, const int n, const std::complex< float > *alpha, const std::complex< float > *const x[], const int incx, const std::complex< float > *const y[], const int incy, std::complex< float > *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<>
sycl::event ger_batched< std::complex< double > > (sycl::queue &handle, const int m, const int n, const std::complex< double > *alpha, const std::complex< double > *const x[], const int incx, const std::complex< double > *const y[], const int incy, std::complex< double > *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events)
 
template<typename T1 , typename T2 >
sycl::event transpose (sycl::queue &q, const T1 *restrict in, int m, int lda, T2 *restrict out, int n, int ldb, const std::vector< sycl::event > &events)
 
template sycl::event transpose (sycl::queue &q, const float *restrict in, int m, int lda, double *restrict out, int n, int ldb, const std::vector< sycl::event > &events)
 
template sycl::event transpose (sycl::queue &q, const double *restrict in, int m, int lda, double *restrict out, int n, int ldb, const std::vector< sycl::event > &events)
 
template sycl::event transpose (sycl::queue &q, const std::complex< float > *restrict in, int m, int lda, std::complex< double > *restrict out, int n, int ldb, const std::vector< sycl::event > &events)
 
template sycl::event transpose (sycl::queue &q, const std::complex< double > *restrict in, int m, int lda, std::complex< double > *restrict out, int n, int ldb, const std::vector< sycl::event > &events)
 
template<typename T1 , typename T2 >
sycl::event copy_n (sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
 
template sycl::event copy_n (sycl::queue &aq, const double *restrict VA, size_t array_size, float *restrict VC, const std::vector< sycl::event > &events)
 
template sycl::event copy_n (sycl::queue &aq, const std::complex< double > *restrict VA, size_t array_size, std::complex< float > *restrict VC, const std::vector< sycl::event > &events)
 
oneapi::mkl::transpose convertTransEnum (char trans)
 
template<typename T >
sycl::event gemv_batched (sycl::queue &handle, const char trans, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events={})
 In-house version of gemv_batch implemented in SYCL. It can be dropped once vendor-optimized versions are available. More...
 
template<typename T >
sycl::event ger_batched (sycl::queue &handle, const int m, const int n, const T *alpha, const T *const x[], const int incx, const T *const y[], const int incy, T *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events={})
 In-house version of ger_batch implemented in SYCL. It can be dropped once vendor-optimized versions are available. More...
 
template<typename T1 , typename T2 >
sycl::event transpose (sycl::queue &q, const T1 *in, int m, int lda, T2 *out, int n, int ldb, const std::vector< sycl::event > &events={})
 
template<typename T1 , typename T2 >
sycl::event copy_n (sycl::queue &aq, const T1 *VA, size_t array_size, T2 *VC, const std::vector< sycl::event > &events={})
 

Typedef Documentation

◆ syclBLAS_handle

using syclBLAS_handle = sycl::queue

Definition at line 26 of file syclBLAS.hpp.

◆ syclBLAS_int

using syclBLAS_int = std::int64_t

Definition at line 24 of file syclBLAS.hpp.

◆ syclBLAS_status

using syclBLAS_status = sycl::event

Definition at line 25 of file syclBLAS.hpp.

Function Documentation

◆ convertTransEnum()

oneapi::mkl::transpose qmcplusplus::syclBLAS::convertTransEnum ( char  trans)
inline

Definition at line 28 of file syclBLAS.hpp.

Referenced by qmcplusplus::compute::BLAS::gemm(), gemm(), qmcplusplus::compute::BLAS::gemm_batched(), gemv(), and qmcplusplus::compute::BLAS::gemv().

29 {
30  if (trans == 'N' || trans == 'n')
31  return oneapi::mkl::transpose::nontrans;
32  else if (trans == 'T' || trans == 't')
33  return oneapi::mkl::transpose::trans;
34  else if (trans == 'C' || trans == 'c')
35  return oneapi::mkl::transpose::conjtrans;
36  else
37  throw std::runtime_error(
38  "syclBLAS::convertTransEnum trans can only be 'N', 'T', 'C', 'n', 't', 'c'. Input value is " +
39  std::string(1, trans));
40 }

◆ copy_n() [1/4]

sycl::event qmcplusplus::syclBLAS::copy_n ( sycl::queue aq,
const T1 *  VA,
size_t  array_size,
T2 *  VC,
const std::vector< sycl::event > &  events = {} 
)

◆ copy_n() [2/4]

sycl::event qmcplusplus::syclBLAS::copy_n ( sycl::queue aq,
const T1 *restrict  VA,
size_t  array_size,
T2 *restrict  VC,
const std::vector< sycl::event > &  events 
)

Definition at line 548 of file syclBLAS.cpp.

Referenced by DelayedUpdate< T, T_FP >::acceptRow(), DelayedUpdateSYCL< T, T_FP >::acceptRow(), DelayedUpdateCUDA< T, T_FP >::acceptRow(), SplineR2R< ST >::applyRotation(), SplineC2C< ST >::applyRotation(), AtomicOrbitals< ST >::applyRotation(), TraceBuffer< TraceInt >::collect_sample(), Vector< T, std::allocator< T > >::construct_copy_elements(), LCAOrbitalSet::evaluate_vgh_impl(), LCAOrbitalSet::evaluate_vghgh_impl(), LCAOrbitalSet::evaluate_vgl_impl(), DiracDeterminant< DU_TYPE >::evaluateDerivRatios(), DiracDeterminantBatched< PL, VT, FPVT >::evaluateDerivRatios(), LCAOrbitalSet::evaluateDetRatios(), SplineC2COMPTarget< ST >::evaluateDetRatios(), SplineC2ROMPTarget< ST >::evaluateDetRatios(), DiracDeterminant< DU_TYPE >::evaluateRatios(), DiracDeterminantBatched< PL, VT, FPVT >::evaluateRatios(), DiracDeterminant< DU_TYPE >::evaluateSpinorRatios(), DiracDeterminantBatched< PL, VT, FPVT >::evaluateSpinorRatios(), DelayedUpdate< T, T_FP >::getInvRow(), DelayedUpdateSYCL< T, T_FP >::getInvRow(), DelayedUpdateCUDA< T, T_FP >::getInvRow(), qmcplusplus::testing::getParticularListener(), MCCoords< CoordsType::POS >::getSubset(), MCCoords< CoordsType::POS_SPIN >::getSubset(), syclSolverInverter< T_FP >::invert_transpose(), WalkerLogBuffer< WLog::Real >::makeNewRow(), LCAOrbitalSet::mw_evaluateValue(), LCAOrbitalSet::mw_evaluateValueImplGEMM(), LCAOrbitalSet::mw_evaluateValueVPsImplGEMM(), LCAOrbitalSet::mw_evaluateVGL(), LCAOrbitalSet::mw_evaluateVGLImplGEMM(), StructFact::mw_updateAllPart(), VectorSoaContainer< ST, 5 >::operator=(), Vector< T, std::allocator< T > >::operator=(), ParticleAttribXmlNode< PAT >::put(), EstimatorManagerNew::reduceOperatorEstimators(), RotatedSPOs::resetParametersExclusive(), WalkerLogBuffer< WLog::Real >::resetRowSize(), SplineR2R< ST >::storeParamsBeforeRotation(), SplineC2C< ST >::storeParamsBeforeRotation(), AtomicOrbitals< ST >::storeParamsBeforeRotation(), qmcplusplus::TEST_CASE(), qmcplusplus::testDualAllocator(), SoaDistanceTableAB< T, D, SC >::update(), SoaDistanceTableAA< T, D, SC >::update(), SoaDistanceTableABOMPTarget< T, D, SC >::update(), SoaDistanceTableAAOMPTarget< T, D, SC >::update(), SoaDistanceTableAA< T, D, SC >::updatePartial(), SoaDistanceTableAAOMPTarget< T, D, SC >::updatePartial(), VectorSoaContainer< ST, 5 >::VectorSoaContainer(), NESpaceGrid< REAL >::write(), and OperatorEstBase::write().

553 {
554  if (array_size == 0)
555  return sycl::event();
556  constexpr size_t tile_size = 64;
557  const size_t a_max = ((array_size + tile_size - 1) / tile_size) * tile_size;
558  return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) {
559  if (id < array_size)
560  VC[id] = static_cast<T2>(VA[id]);
561  });
562 }

◆ copy_n() [3/4]

template sycl::event qmcplusplus::syclBLAS::copy_n ( sycl::queue aq,
const double *restrict  VA,
size_t  array_size,
float *restrict  VC,
const std::vector< sycl::event > &  events 
)

◆ copy_n() [4/4]

template sycl::event qmcplusplus::syclBLAS::copy_n ( sycl::queue aq,
const std::complex< double > *restrict  VA,
size_t  array_size,
std::complex< float > *restrict  VC,
const std::vector< sycl::event > &  events 
)

◆ gemm() [1/5]

sycl::event gemm ( sycl::queue handle,
const char  tA,
const char  tB,
const int  m,
const int  n,
const int  k,
const T  alpha,
const T *  A,
const int  lda,
const T *  B,
const int  ldb,
const T  beta,
T *  C,
const int  ldc,
const std::vector< sycl::event > &  events 
)

Definition at line 275 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, B(), qmcplusplus::Units::charge::C, convertTransEnum(), gemm(), qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

Referenced by DelayedUpdateSYCL< T, T_FP >::updateInvMat().

290 {
291  return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb,
292  beta, C, ldc, events);
293 }
oneapi::mkl::transpose convertTransEnum(char trans)
Definition: syclBLAS.hpp:28
template sycl::event gemm(sycl::queue &handle, const char tA, const char tB, const int m, const int n, const int k, const std::complex< double > alpha, const std::complex< double > *const A, const int lda, const std::complex< double > *const B, const int ldb, const std::complex< double > beta, std::complex< double > *const C, const int ldc, const std::vector< sycl::event > &events)
double B(double x, int k, int i, const std::vector< double > &t)

◆ gemm() [2/5]

template sycl::event qmcplusplus::syclBLAS::gemm ( sycl::queue handle,
const char  tA,
const char  tB,
const int  m,
const int  n,
const int  k,
const float  alpha,
const float *const  A,
const int  lda,
const float *const  B,
const int  ldb,
const float  beta,
float *const  C,
const int  ldc,
const std::vector< sycl::event > &  events 
)

◆ gemm() [3/5]

template sycl::event qmcplusplus::syclBLAS::gemm ( sycl::queue handle,
const char  tA,
const char  tB,
const int  m,
const int  n,
const int  k,
const double  alpha,
const double *const  A,
const int  lda,
const double *const  B,
const int  ldb,
const double  beta,
double *const  C,
const int  ldc,
const std::vector< sycl::event > &  events 
)

◆ gemm() [4/5]

template sycl::event qmcplusplus::syclBLAS::gemm ( sycl::queue handle,
const char  tA,
const char  tB,
const int  m,
const int  n,
const int  k,
const std::complex< float >  alpha,
const std::complex< float > *const  A,
const int  lda,
const std::complex< float > *const  B,
const int  ldb,
const std::complex< float >  beta,
std::complex< float > *const  C,
const int  ldc,
const std::vector< sycl::event > &  events 
)

◆ gemm() [5/5]

template sycl::event qmcplusplus::syclBLAS::gemm ( sycl::queue handle,
const char  tA,
const char  tB,
const int  m,
const int  n,
const int  k,
const std::complex< double >  alpha,
const std::complex< double > *const  A,
const int  lda,
const std::complex< double > *const  B,
const int  ldb,
const std::complex< double >  beta,
std::complex< double > *const  C,
const int  ldc,
const std::vector< sycl::event > &  events 
)

Referenced by gemm().

◆ gemv() [1/5]

sycl::event gemv ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const T  alpha,
const T *const  A,
const int  lda,
const T *const  x,
const int  incx,
const T  beta,
T *const  y,
const int  incy,
const std::vector< sycl::event > &  events 
)

Definition at line 21 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, convertTransEnum(), gemv(), qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

34 {
35  return oneapi::mkl::blas::gemv(handle, convertTransEnum(trans), m, n, alpha, A, lda, x, incx, beta, y, incy, events);
36 }
oneapi::mkl::transpose convertTransEnum(char trans)
Definition: syclBLAS.hpp:28
template sycl::event gemv(sycl::queue &handle, const char trans, const int m, const int n, const std::complex< float > alpha, const std::complex< float > *const A, const int lda, const std::complex< float > *const x, const int incx, const std::complex< float > beta, std::complex< float > *const y, const int incy, const std::vector< sycl::event > &events)

◆ gemv() [2/5]

template sycl::event qmcplusplus::syclBLAS::gemv ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const double  alpha,
const double *const  A,
const int  lda,
const double *const  x,
const int  incx,
const double  beta,
double *const  y,
const int  incy,
const std::vector< sycl::event > &  events 
)

◆ gemv() [3/5]

template sycl::event qmcplusplus::syclBLAS::gemv ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const float  alpha,
const float *const  A,
const int  lda,
const float *const  x,
const int  incx,
const float  beta,
float *const  y,
const int  incy,
const std::vector< sycl::event > &  events 
)

◆ gemv() [4/5]

template sycl::event qmcplusplus::syclBLAS::gemv ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const std::complex< double >  alpha,
const std::complex< double > *const  A,
const int  lda,
const std::complex< double > *const  x,
const int  incx,
const std::complex< double >  beta,
std::complex< double > *const  y,
const int  incy,
const std::vector< sycl::event > &  events 
)

◆ gemv() [5/5]

template sycl::event qmcplusplus::syclBLAS::gemv ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const std::complex< float >  alpha,
const std::complex< float > *const  A,
const int  lda,
const std::complex< float > *const  x,
const int  incx,
const std::complex< float >  beta,
std::complex< float > *const  y,
const int  incy,
const std::vector< sycl::event > &  events 
)

Referenced by gemv().

◆ gemv_batched()

sycl::event qmcplusplus::syclBLAS::gemv_batched ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const T *  alpha,
const T *const  A[],
const int  lda,
const T *const  x[],
const int  incx,
const T *  beta,
T *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events = {} 
)

In-house version of gemv_batch implemented in SYCL. It can be dropped once vendor-optimized versions are available.

Referenced by qmcplusplus::compute::BLAS::gemv_batched().

◆ gemv_batched< double >()

sycl::event qmcplusplus::syclBLAS::gemv_batched< double > ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const double *  alpha,
const double *const  A[],
const int  lda,
const double *const  x[],
const int  incx,
const double *  beta,
double *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 196 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

210 {
211  if (trans == 'N' || trans == 'n')
212  return gemvN_batched_impl<double, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
213  else if (trans == 'T' || trans == 't')
214  return gemvT_batched_impl<double, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
215  else
216  throw std::runtime_error("syclBLAS::gemv_batched only supports 'N', 'T', 'C', 'n'. Input value is " +
217  std::string(1, trans));
218 }

◆ gemv_batched< float >()

sycl::event qmcplusplus::syclBLAS::gemv_batched< float > ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const float *  alpha,
const float *const  A[],
const int  lda,
const float *const  x[],
const int  incx,
const float *  beta,
float *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 171 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

185 {
186  if (trans == 'N' || trans == 'n')
187  return gemvN_batched_impl<float, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
188  else if (trans == 'T' || trans == 't')
189  return gemvT_batched_impl<float, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
190  else
191  throw std::runtime_error("syclBLAS::gemv_batched only supports 'N', 'T', 'C', 'n'. Input value is " +
192  std::string(1, trans));
193 }

◆ gemv_batched< std::complex< double > >()

sycl::event qmcplusplus::syclBLAS::gemv_batched< std::complex< double > > ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const std::complex< double > *  alpha,
const std::complex< double > *const  A[],
const int  lda,
const std::complex< double > *const  x[],
const int  incx,
const std::complex< double > *  beta,
std::complex< double > *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 248 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

262 {
263  if (trans == 'N' || trans == 'n')
264  return gemvN_batched_impl<std::complex<double>, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy,
265  batch_count);
266  else if (trans == 'T' || trans == 't')
267  return gemvT_batched_impl<std::complex<double>, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy,
268  batch_count);
269  else
270  throw std::runtime_error("syclBLAS::gemv_batched only supports 'N', 'T', 'C', 'n'. Input value is " +
271  std::string(1, trans));
272 }

◆ gemv_batched< std::complex< float > >()

sycl::event qmcplusplus::syclBLAS::gemv_batched< std::complex< float > > ( sycl::queue handle,
const char  trans,
const int  m,
const int  n,
const std::complex< float > *  alpha,
const std::complex< float > *const  A[],
const int  lda,
const std::complex< float > *const  x[],
const int  incx,
const std::complex< float > *  beta,
std::complex< float > *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 221 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

235 {
236  if (trans == 'N' || trans == 'n')
237  return gemvN_batched_impl<std::complex<float>, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy,
238  batch_count);
239  else if (trans == 'T' || trans == 't')
240  return gemvT_batched_impl<std::complex<float>, 64>(handle, m, n, alpha, A, lda, x, incx, beta, y, incy,
241  batch_count);
242  else
243  throw std::runtime_error("syclBLAS::gemv_batched only supports 'N', 'T', 'C', 'n'. Input value is " +
244  std::string(1, trans));
245 }

◆ gemvN_batched_impl()

sycl::event qmcplusplus::syclBLAS::gemvN_batched_impl ( sycl::queue handle,
const int  m,
const int  n,
const T *  alpha,
const T *const  A[],
const int  lda,
const T *const  x[],
const int  incx,
const T *  beta,
T *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events = {} 
)

gemv trans = 'N' case.

ROW refers to rows of the m x n column-major (Fortran-order) matrix A; the ROWBS template parameter is the number of rows assigned to each work-group.

Definition at line 135 of file syclBLAS.cpp.

147  {})
148 {
149  if (m == 0 || n == 0 || batch_count == 0)
150  return sycl::event();
151 
152  const int num_row_blocks = (m + ROWBS - 1) / ROWBS;
153  return handle.parallel_for(sycl::nd_range<2>{{batch_count, num_row_blocks * ROWBS}, {1, ROWBS}},
154  [=](sycl::nd_item<2> item) {
155  const unsigned batch = item.get_group(0);
156  const int row = item.get_global_id(1);
157  if (row < m)
158  {
159  T sum(0);
160  for (int col = 0; col < n; col++)
161  sum += A[batch][col * lda + row] * x[batch][col * incx];
162  if (beta[batch] == T(0))
163  y[batch][row * incy] = alpha[batch] * sum; // protecting NaN from y_iw
164  else
165  y[batch][row * incy] = alpha[batch] * sum + beta[batch] * y[batch][row * incy];
166  }
167  });
168 }

◆ gemvT_batched_impl()

sycl::event qmcplusplus::syclBLAS::gemvT_batched_impl ( sycl::queue handle,
const int  m,
const int  n,
const T *  alpha,
const T *const  A[],
const int  lda,
const T *const  x[],
const int  incx,
const T *  beta,
T *const  y[],
const int  incy,
const size_t  batch_count,
const std::vector< sycl::event > &  events = {} 
)

gemv trans = 'T' case.

COL refers to columns of the m x n column-major (Fortran-order) matrix A; the COLBS template parameter is the number of columns assigned to each work-group.

Definition at line 97 of file syclBLAS.cpp.

109  {})
110 {
111  if (m == 0 || n == 0 || batch_count == 0)
112  return sycl::event();
113 
114  const int num_col_blocks = (n + COLBS - 1) / COLBS;
115  return handle.parallel_for(sycl::nd_range<2>{{batch_count, num_col_blocks * COLBS}, {1, COLBS}},
116  [=](sycl::nd_item<2> item) {
117  const unsigned batch = item.get_group(0);
118  const int col = item.get_global_id(1);
119  if (col < n)
120  {
121  T sum(0);
122  for (int row = 0; row < m; row++)
123  sum += A[batch][col * lda + row] * x[batch][row * incx];
124  if (beta[batch] == T(0))
125  y[batch][col * incy] = alpha[batch] * sum; // protecting NaN from y_iw
126  else
127  y[batch][col * incy] = alpha[batch] * sum + beta[batch] * y[batch][col * incy];
128  }
129  });
130 }

◆ ger_batched()

sycl::event qmcplusplus::syclBLAS::ger_batched ( sycl::queue handle,
const int  m,
const int  n,
const T *  alpha,
const T *const  x[],
const int  incx,
const T *const  y[],
const int  incy,
T *const  A[],
const int  lda,
const size_t  batch_count,
const std::vector< sycl::event > &  events = {} 
)

In-house version of ger_batch implemented in SYCL. It can be dropped once vendor-optimized versions are available.

Referenced by qmcplusplus::compute::BLAS::ger_batched().

◆ ger_batched< double >()

sycl::event qmcplusplus::syclBLAS::ger_batched< double > ( sycl::queue handle,
const int  m,
const int  n,
const double *  alpha,
const double *const  x[],
const int  incx,
const double *const  y[],
const int  incy,
double *const  A[],
const int  lda,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 421 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

433 {
434  return ger_batched_impl<double, 32, 8>(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count, events);
435 }

◆ ger_batched< float >()

sycl::event qmcplusplus::syclBLAS::ger_batched< float > ( sycl::queue handle,
const int  m,
const int  n,
const float *  alpha,
const float *const  x[],
const int  incx,
const float *const  y[],
const int  incy,
float *const  A[],
const int  lda,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 404 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

416 {
417  return ger_batched_impl<float, 32, 8>(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count, events);
418 }

◆ ger_batched< std::complex< double > >()

sycl::event qmcplusplus::syclBLAS::ger_batched< std::complex< double > > ( sycl::queue handle,
const int  m,
const int  n,
const std::complex< double > *  alpha,
const std::complex< double > *const  x[],
const int  incx,
const std::complex< double > *const  y[],
const int  incy,
std::complex< double > *const  A[],
const int  lda,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 456 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

468 {
469  return ger_batched_impl<std::complex<double>, 32, 8>(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count,
470  events);
471 }

◆ ger_batched< std::complex< float > >()

sycl::event qmcplusplus::syclBLAS::ger_batched< std::complex< float > > ( sycl::queue handle,
const int  m,
const int  n,
const std::complex< float > *  alpha,
const std::complex< float > *const  x[],
const int  incx,
const std::complex< float > *const  y[],
const int  incy,
std::complex< float > *const  A[],
const int  lda,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 438 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

450 {
451  return ger_batched_impl<std::complex<float>, 32, 8>(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count,
452  events);
453 }

◆ ger_batched_impl()

sycl::event qmcplusplus::syclBLAS::ger_batched_impl ( sycl::queue handle,
const int  m,
const int  n,
const T *  alpha,
const T *const  x[],
const int  incx,
const T *const  y[],
const int  incy,
T *const  A[],
const int  lda,
const size_t  batch_count,
const std::vector< sycl::event > &  events 
)

Definition at line 361 of file syclBLAS.cpp.

References qmcplusplus::Units::distance::A, qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

373 {
374  static_assert(ROWBS <= TILE_SIZE, "ROWBS cannot be larger than TILE_SIZE!");
375  if (m == 0 || n == 0 || batch_count == 0)
376  return sycl::event();
377 
378  // A is m x n in Fortran, n x m in C.
379  constexpr size_t tile_size = TILE_SIZE;
380  constexpr size_t block_rows = ROWBS;
381  // the computation is tiled and distributed.
382  const size_t row_tiles = (n + tile_size - 1) / tile_size;
383  const size_t col_tiles = (m + tile_size - 1) / tile_size;
384 
385  return handle.parallel_for(sycl::nd_range<3>{{batch_count, row_tiles * block_rows, col_tiles * tile_size},
386  {1, block_rows, tile_size}},
387  [=](sycl::nd_item<3> item) {
388  const unsigned batch = item.get_group(0);
389  const unsigned thX = item.get_local_id(2);
390  const unsigned thY = item.get_local_id(1);
391  const unsigned column = item.get_group(2) * tile_size + thX;
392  const unsigned row_offset = item.get_group(1) * tile_size + thY;
393  if (column < m)
394  {
395  const T alphaX = alpha[batch] * x[batch][column * incx];
396  for (unsigned j = 0; j < tile_size; j += block_rows)
397  if (const unsigned row = row_offset + j; row < n)
398  A[batch][row * lda + column] += alphaX * y[batch][row * incy];
399  }
400  });
401 }

◆ transpose() [1/6]

sycl::event qmcplusplus::syclBLAS::transpose ( sycl::queue q,
const T1 *  in,
int  m,
int  lda,
T2 *  out,
int  n,
int  ldb,
const std::vector< sycl::event > &  events = {} 
)

◆ transpose() [2/6]

sycl::event qmcplusplus::syclBLAS::transpose ( sycl::queue q,
const T1 *restrict  in,
int  m,
int  lda,
T2 *restrict  out,
int  n,
int  ldb,
const std::vector< sycl::event > &  events 
)

Definition at line 475 of file syclBLAS.cpp.

References qmcplusplus::lda, qmcplusplus::Units::distance::m, and qmcplusplus::n.

Referenced by syclSolverInverter< T_FP >::invert_transpose().

483 {
484  constexpr size_t tile_size = 16;
485  const size_t m_max = ((m + tile_size - 1) / tile_size) * tile_size;
486  const size_t n_max = ((n + tile_size - 1) / tile_size) * tile_size;
487 
488  return q.submit([&](sycl::handler& cgh) {
489  cgh.depends_on(events);
490  sycl::local_accessor<T2, 2> tile(sycl::range<2>(tile_size, tile_size + 1), cgh);
491 
492  cgh.parallel_for(sycl::nd_range<2>{{m_max, n_max}, {tile_size, tile_size}}, [=](sycl::nd_item<2> item) {
493  unsigned x = item.get_global_id(1);
494  unsigned y = item.get_global_id(0);
495  unsigned xth = item.get_local_id(1);
496  unsigned yth = item.get_local_id(0);
497 
498  if (x < n && y < m)
499  tile[yth][xth] = in[(y)*lda + x];
500  item.barrier(sycl::access::fence_space::local_space);
501 
502  x = item.get_group(0) * tile_size + xth;
503  y = item.get_group(1) * tile_size + yth;
504  if (x < m && y < n)
505  out[(y)*ldb + x] = tile[xth][yth];
506  });
507  });
508 }

◆ transpose() [3/6]

template sycl::event qmcplusplus::syclBLAS::transpose ( sycl::queue q,
const float *restrict  in,
int  m,
int  lda,
double *restrict  out,
int  n,
int  ldb,
const std::vector< sycl::event > &  events 
)

◆ transpose() [4/6]

template sycl::event qmcplusplus::syclBLAS::transpose ( sycl::queue q,
const double *restrict  in,
int  m,
int  lda,
double *restrict  out,
int  n,
int  ldb,
const std::vector< sycl::event > &  events 
)

◆ transpose() [5/6]

template sycl::event qmcplusplus::syclBLAS::transpose ( sycl::queue q,
const std::complex< float > *restrict  in,
int  m,
int  lda,
std::complex< double > *restrict  out,
int  n,
int  ldb,
const std::vector< sycl::event > &  events 
)

◆ transpose() [6/6]

template sycl::event qmcplusplus::syclBLAS::transpose ( sycl::queue q,
const std::complex< double > *restrict  in,
int  m,
int  lda,
std::complex< double > *restrict  out,
int  n,
int  ldb,
const std::vector< sycl::event > &  events 
)