12 #ifndef QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H 13 #define QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H 15 #include <type_traits> 48 template<
typename VALUE_FP>
111 const int nw = a_mats.size();
112 assert(a_mats.size() == inv_a_mats.size());
115 const int lda = a_mats[0].get().cols();
116 const int ldinv = inv_a_mats[0].get().cols();
120 for (
int iw = 0; iw < nw; ++iw)
129 "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM to device");
132 inv_a_mats[iw].
get().device_data(), ldinv, &
host_zero,
134 "cuBLAS::geam failed.");
141 "cudaMemcpyAsync psiM_invM_ptrs_ failed!");
146 for (
int iw = 0; iw < nw; ++iw)
150 "cudaMemcpyAsync failed copying DiracMatrixBatch::inv_psiM to host");
154 "cudaMemcpyAsync log_values failed!");
185 for (
int iw = 0; iw < nw; ++iw)
197 "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM_fp to device");
200 "cudaMemcpyAsync psiM_invM_ptrs_ failed!");
212 "cudaMemcpyAsync failed copying back DiracMatrixBatch::invM_fp from device");
215 "cudaMemcpyAsync log_values failed!");
232 std::unique_ptr<Resource>
makeClone()
const override {
return std::make_unique<DiracMatrixComputeCUDA>(*this); }
243 template<
typename TMAT>
251 const int n = a_mat.
rows();
259 "cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
263 "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM_fp to device");
274 "cudaMemcpyAsync of inv_a_mat to device failed!");
290 template<
typename TMAT>
300 const int nw = a_mats.size();
301 const int n = a_mats[0].get().rows();
302 const int lda = a_mats[0].get().cols();
310 "cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
311 for (
int iw = 0; iw < nw; ++iw)
314 for (
int iw = 0; iw < a_mats.size(); ++iw)
320 inv_a_mats[iw].get().assignUpperLeft(data_ref_matrix);
323 "cudaMemcpyAsync of inv_a_mat to device failed!");
332 template<
typename TMAT>
342 const int n = a_mats[0].get().rows();
349 #endif //QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H
~DiracMatrixComputeCUDA()
const std::string & getName() const
helper functions for EinsplineSetBuilder
DualVector< VALUE_FP * > psiM_invM_ptrs_
Transfer buffer for device pointers to matrices.
std::vector< StdComp, CUDAHostAllocator< StdComp > > log_values(batch_size)
pointer device_data()
Return the device_ptr matching X if this is a vector attached or owning dual space memory...
RealAlias< VALUE_FP > FullPrecReal
void transpose(const T *restrict A, size_t m, size_t lda, TO *restrict B, size_t n, size_t ldb)
transpose of A(m,n) to B(n,m)
std::enable_if_t<!std::is_same< VALUE_FP, TMAT >::value > mw_invertTranspose(compute::Queue< PlatformKind::CUDA > &queue, const RefVector< const DualMatrix< TMAT >> &a_mats, const RefVector< DualMatrix< TMAT >> &inv_a_mats, DualVector< LogValue > &log_values)
Mixed precision specialization. When TMAT is not full precision we still need to do the inversion and ...
void computeInverseAndDetLog_batched(cublasHandle_t &h_cublas, cudaStream_t &hstream, const int n, const int lda, T *Ms[], T *Cs[], T *LU_diags, int *pivots, int *host_infos, int *infos, std::complex< double > *log_dets, const int batch_size)
Takes PsiM in column major layout and uses LU factorization to compute the log determinant and invPsi...
At the qmcplusplus cuBLAS_LU level all *, **, *[] are assumed to be to device addresses.
std::unique_ptr< Resource > makeClone() const override
DualVector< VALUE_FP > psiM_fp_
size_type size() const
return the current size
DualVector< VALUE_FP > invM_fp_
void assignUpperLeft(const Matrix< T_FROM, ALLOC_FROM > &from)
This assigns from a matrix with larger row size (used for alignment) to whatever the rowsize is here...
DualVector< VALUE_FP > LU_diags_fp_
void mw_computeInvertAndLog(compute::Queue< PlatformKind::CUDA > &queue, const RefVector< const DualMatrix< VALUE_FP >> &a_mats, const RefVector< DualMatrix< VALUE_FP >> &inv_a_mats, const int n, DualVector< LogValue > &log_values)
Calculates the actual inv and log determinant on accelerator.
std::enable_if_t< std::is_same< VALUE_FP, TMAT >::value > mw_invertTranspose(compute::Queue< PlatformKind::CUDA > &queue, const RefVector< const DualMatrix< TMAT >> &a_mats, const RefVector< DualMatrix< TMAT >> &inv_a_mats, DualVector< LogValue > &log_values)
Batched inversion and calculation of log determinants.
void remapCopy(size_t m, size_t n, const T *restrict A, size_t lda, TO *restrict B, size_t ldb)
copy of A(m,n) to B(m,n)
These allocators are to make code that should be generic with the respect to accelerator code flavor ...
void invert_transpose(compute::Queue< PlatformKind::CUDA > &queue, DualMatrix< TMAT > &a_mat, DualMatrix< TMAT > &inv_a_mat, DualVector< LogValue > &log_values)
Given a_mat, returns the inverted a_mat and the log determinant of a_mat.
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cudaMemcpyDeviceToHost
#define cudaStreamSynchronize
#define cudaMemcpyHostToDevice
void mw_computeInvertAndLog_stride(compute::Queue< PlatformKind::CUDA > &queue, DualVector< VALUE_FP > &psi_Ms, DualVector< VALUE_FP > &inv_Ms, const int n, const int lda, DualVector< LogValue > &log_values)
Calculates the actual inv and log determinant on accelerator with psiMs and invMs widened to full pre...
std::vector< std::reference_wrapper< T > > RefVector
void attachReference(T *ref)
class defining a compute and memory resource to compute matrix inversion and the log determinants of ...
cublasStatus_t geam(cublasHandle_t &handle, cublasOperation_t &transa, cublasOperation_t &transb, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc)
typename RealAlias_impl< T >::value_type RealAlias
If you have a function templated on a value that can be real or complex and you need to get the base ...
std::complex< FullPrecReal > LogValue
DiracMatrixComputeCUDA(const DiracMatrixComputeCUDA &other)
#define cublasErrorCheck(ans, cause)
DualVector< int > pivots_
SIMD version of functions in algorithm.