DiracMatrixComputeCUDA.hpp
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2021 QMCPACK developers.
//
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab
//
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H
#define QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H

#include <type_traits>

#include "OhmmsPETE/OhmmsMatrix.h"
#include "DualAllocatorAliases.hpp"
#include "Concurrency/OpenMP.h"
#include "CPU/SIMD/algorithm.hpp"
#include "ResourceCollection.h"

namespace qmcplusplus
{
/** class defining a compute and memory resource to compute matrix inversion and
 * the log determinants of a batch of DiracMatrices.
 * The multiplicity is one per crowd, not one per UpdateEngine;
 * it matches the multiplicity of the accelerator calls
 * and the batched resource requirement.
 *
 * @tparam VALUE_FP the datatype used in the actual computation of the matrix inversion
 *
 * There are no per-walker variables; resources specific to the per-crowd
 * compute object are owned here. The compute object itself is the resource
 * provided to the per-walker DiracDeterminantBatched.
 * Resources used by this object but owned by the
 * surrounding scope are passed as arguments.
 *
 * All the public APIs are synchronous; the asynchronous queue argument gets synchronized before return.
 * rocBLAS, used indirectly via hipBLAS, requires synchronizing the old stream before setting a new one.
 * We don't need to synchronize the old stream explicitly because it gets synchronized right after each use.
 *
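 * A minimal usage sketch (illustrative only, not part of this header; `nw`, `n`, and the enclosing
 * crowd scope are assumed, and the matrix/vector types simply spell out the private
 * DualMatrix/DualVector aliases used by the members below). In production this object is normally
 * owned and lent per crowd through the Resource machinery rather than constructed ad hoc.
 * \code{.cpp}
 *   DiracMatrixComputeCUDA<double> inverter;
 *   compute::Queue<PlatformKind::CUDA> queue;
 *   std::vector<Matrix<double, PinnedDualAllocator<double>>> psiMs(nw), invMs(nw);
 *   Vector<std::complex<double>, PinnedDualAllocator<std::complex<double>>> log_values(nw);
 *   for (int iw = 0; iw < nw; ++iw)
 *   {
 *     psiMs[iw].resize(n, n);
 *     invMs[iw].resize(n, n);
 *     // ... fill psiMs[iw] on the host ...
 *   }
 *   RefVector<const Matrix<double, PinnedDualAllocator<double>>> psiM_refs(psiMs.begin(), psiMs.end());
 *   RefVector<Matrix<double, PinnedDualAllocator<double>>> invM_refs(invMs.begin(), invMs.end());
 *   inverter.mw_invertTranspose(queue, psiM_refs, invM_refs, log_values);
 *   // The call is synchronous: on return invMs holds the inverses on host and device and
 *   // log_values[iw] holds the log determinant of walker iw's matrix.
 * \endcode
 *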
 */
template<typename VALUE_FP>
class DiracMatrixComputeCUDA : public Resource
{
  using FullPrecReal = RealAlias<VALUE_FP>;
  using LogValue     = std::complex<FullPrecReal>;

  template<typename T>
  using DualMatrix = Matrix<T, PinnedDualAllocator<T>>;

  template<typename T>
  using DualVector = Vector<T, PinnedDualAllocator<T>>;

  // Contiguous memory for full precision (fp) matrices for each walker, n^2 * nw elements
  DualVector<VALUE_FP> psiM_fp_;
  DualVector<VALUE_FP> invM_fp_;

  // working vectors
  DualVector<VALUE_FP> LU_diags_fp_;
  DualVector<int> pivots_;
  DualVector<int> infos_;

  //DualMatrix<T_FP> temp_mat_;

  /** Transfer buffer for device pointers to matrices.
   * The element count is usually low and the transfer launch cost is higher than the transfer itself.
   * For this reason it is beneficial to fuse multiple lists of pointers.
   * Right now this buffer packs nw psiM pointers followed by nw invM pointers.
   * Use only within a function scope and do not rely on its previous value.
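   *
   * Layout sketch (illustrative, for nw = 2): { psiM_dev[0], psiM_dev[1], invM_dev[0], invM_dev[1] },
   * i.e. walker iw's psiM device pointer sits at index iw and its invM device pointer at index iw + nw.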
   */
  DualVector<VALUE_FP*> psiM_invM_ptrs_;

  // cuBLAS geam wants these.
  VALUE_FP host_one{1.0};
  VALUE_FP host_zero{0.0};

  // cuBLAS handle owned by this inverter
  cublasHandle_t h_cublas_;

  /** Calculates the actual inverse and log determinant on the accelerator
   *
   * \param[in] queue the compute queue; the CUDA stream used for transfers and kernels is retrieved from it.
   * \param[in] a_mats dual A matrices; their host data is copied and transposed into the full-precision work buffer on the device.
   * \param[out] inv_a_mats dual invM matrices
   * \param[in] n matrix rank.
   * \param[out] log_values log determinant value for each matrix, batch_size = log_values.size()
   *
   * On Volta, so far little seems to be gained by making the matrices contiguous.
   *
   * List of operations:
   * 1. matrix-by-matrix. Copy a_mat to inv_a_mat on the host, transfer inv_a_mat to the device, transpose it into psiM_fp_ on the device.
   * 2. batched. LU-factorize and invert.
   * 3. matrix-by-matrix. Transfer inv_a_mat to the host.
   *
   * Pros and cons:
   * 1. \todo Try doing this like mw_computeInvertAndLog_stride: copy and transpose into psiM_fp_ and fuse the transfer.
   * 3. \todo Remove the transfer of inv_a_mat to the host and let the upper-level code handle it.
   */
  inline void mw_computeInvertAndLog(compute::Queue<PlatformKind::CUDA>& queue,
                                     const RefVector<const DualMatrix<VALUE_FP>>& a_mats,
                                     const RefVector<DualMatrix<VALUE_FP>>& inv_a_mats,
                                     const int n,
                                     DualVector<LogValue>& log_values)
  {
    const int nw = a_mats.size();
    assert(a_mats.size() == inv_a_mats.size());

    psiM_invM_ptrs_.resize(nw * 2);
    const int lda   = a_mats[0].get().cols();
    const int ldinv = inv_a_mats[0].get().cols();
    cudaStream_t h_stream = queue.getNative();
    psiM_fp_.resize(n * ldinv * nw);

    for (int iw = 0; iw < nw; ++iw)
    {
      psiM_invM_ptrs_[iw]      = psiM_fp_.device_data() + iw * n * ldinv;
      psiM_invM_ptrs_[iw + nw] = inv_a_mats[iw].get().device_data();
      // Since inv_a_mat can have a different leading dimension from a_mat, first remap-copy on the host.
      simd::remapCopy(n, n, a_mats[iw].get().data(), lda, inv_a_mats[iw].get().data(), ldinv);
      // Then copy the a_mat data now held in inv_a_mat to the device.
      cudaErrorCheck(cudaMemcpyAsync(inv_a_mats[iw].get().device_data(), inv_a_mats[iw].get().data(),
                                     inv_a_mats[iw].get().size() * sizeof(VALUE_FP), cudaMemcpyHostToDevice, h_stream),
                     "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM to device");
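      // cuBLAS geam computes C = alpha * op(A) + beta * op(B); with alpha = 1, beta = 0, and
      // op(A) = A^T this amounts to an out-of-place transpose of the freshly uploaded matrix.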
      // On the device, transpose into the psiM_fp_ slot addressed by psiM_invM_ptrs_[iw].
      cublasErrorCheck(cuBLAS::geam(h_cublas_, CUBLAS_OP_T, CUBLAS_OP_N, n, n, &host_one,
                                    inv_a_mats[iw].get().device_data(), ldinv, &host_zero,
                                    a_mats[iw].get().device_data(), lda, psiM_invM_ptrs_[iw], ldinv),
                       "cuBLAS::geam failed.");
    }
    pivots_.resize(n * nw);
    infos_.resize(nw);
    LU_diags_fp_.resize(n * nw);
    cudaErrorCheck(cudaMemcpyAsync(psiM_invM_ptrs_.device_data(), psiM_invM_ptrs_.data(),
                                   psiM_invM_ptrs_.size() * sizeof(VALUE_FP*), cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync psiM_invM_ptrs_ failed!");
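    // cuBLAS_LU takes the matrices in column-major layout and uses batched LU factorization to
    // compute both the log determinants and the inverses; apart from the host-side infos buffer,
    // the pointers handed to it are device addresses.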
    cuBLAS_LU::computeInverseAndDetLog_batched(h_cublas_, h_stream, n, ldinv, psiM_invM_ptrs_.device_data(),
                                               psiM_invM_ptrs_.device_data() + nw, LU_diags_fp_.device_data(),
                                               pivots_.device_data(), infos_.data(), infos_.device_data(),
                                               log_values.device_data(), nw);
    for (int iw = 0; iw < nw; ++iw)
    {
      cudaErrorCheck(cudaMemcpyAsync(inv_a_mats[iw].get().data(), inv_a_mats[iw].get().device_data(),
                                     inv_a_mats[iw].get().size() * sizeof(VALUE_FP), cudaMemcpyDeviceToHost, h_stream),
                     "cudaMemcpyAsync failed copying DiracMatrixBatch::inv_psiM to host");
    }
    cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyDeviceToHost, h_stream),
                   "cudaMemcpyAsync log_values failed!");
    cudaErrorCheck(cudaStreamSynchronize(h_stream), "cudaStreamSynchronize failed!");
  }

  /** Calculates the actual inverse and log determinant on the accelerator with psiMs and invMs widened to full precision
   * and copied into contiguous vectors.
   *
   * \param[in] queue the compute queue; the CUDA stream used for transfers and kernels is retrieved from it.
   * \param[in,out] psi_Ms matrices flattened into a single pinned vector, returned holding the LU matrices.
   * \param[out] inv_Ms matrices flattened into a single pinned vector.
   * \param[in] n matrix rank.
   * \param[in] lda leading dimension of each matrix
   * \param[out] log_values log determinant value for each matrix, batch_size = log_values.size()
   *
   * List of operations:
   * 1. batched. Transfer psi_Ms to the device.
   * 2. batched. LU-factorize and invert.
   * 3. batched. Transfer inv_Ms to the host.
   * \todo Remove 1 and 3. Handle the transfers at the upper level.
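   *
   * Strided layout (illustrative): walker iw's psiM occupies psi_Ms[iw * n * lda, (iw + 1) * n * lda)
   * and its inverse the same range of inv_Ms; psiM_invM_ptrs_ packs the corresponding device
   * pointers as { psiM_dev[0..nw), invM_dev[0..nw) }.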
   */
  inline void mw_computeInvertAndLog_stride(compute::Queue<PlatformKind::CUDA>& queue,
                                            DualVector<VALUE_FP>& psi_Ms,
                                            DualVector<VALUE_FP>& inv_Ms,
                                            const int n,
                                            const int lda,
                                            DualVector<LogValue>& log_values)
  {
    // Deriving the batch size from log_values is probably dodgy.
    const int nw = log_values.size();
    psiM_invM_ptrs_.resize(nw * 2);
    for (int iw = 0; iw < nw; ++iw)
    {
      psiM_invM_ptrs_[iw]      = psi_Ms.device_data() + iw * n * lda;
      psiM_invM_ptrs_[iw + nw] = inv_Ms.device_data() + iw * n * lda;
    }
    pivots_.resize(n * nw);
    infos_.resize(nw);
    LU_diags_fp_.resize(n * nw);

    cudaStream_t h_stream = queue.getNative();
    cudaErrorCheck(cudaMemcpyAsync(psi_Ms.device_data(), psi_Ms.data(), psi_Ms.size() * sizeof(VALUE_FP),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM_fp to device");
    cudaErrorCheck(cudaMemcpyAsync(psiM_invM_ptrs_.device_data(), psiM_invM_ptrs_.data(),
                                   psiM_invM_ptrs_.size() * sizeof(VALUE_FP*), cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync psiM_invM_ptrs_ failed!");
    cuBLAS_LU::computeInverseAndDetLog_batched(h_cublas_, h_stream, n, lda, psiM_invM_ptrs_.device_data(),
                                               psiM_invM_ptrs_.device_data() + nw, LU_diags_fp_.device_data(),
                                               pivots_.device_data(), infos_.data(), infos_.device_data(),
                                               log_values.device_data(), nw);
#if NDEBUG
    // This is very useful for checking whether the data is already wrong on the device after all the
    // kernels and cuBLAS calls have run, or whether it only gets corrupted by the copy back.
    // cuBLAS_LU::peekinvM_batched(h_stream, psiM_mw_ptr, invM_mw_ptr, pivots_.device_data(), infos_.device_data(),
    //                             log_values.device_data(), nw);
#endif
    cudaErrorCheck(cudaMemcpyAsync(inv_Ms.data(), inv_Ms.device_data(), inv_Ms.size() * sizeof(VALUE_FP),
                                   cudaMemcpyDeviceToHost, h_stream),
                   "cudaMemcpyAsync failed copying back DiracMatrixBatch::invM_fp from device");
    cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyDeviceToHost, h_stream),
                   "cudaMemcpyAsync log_values failed!");
    cudaErrorCheck(cudaStreamSynchronize(h_stream), "cudaStreamSynchronize failed!");
  }

public:
  DiracMatrixComputeCUDA() : Resource("DiracMatrixComputeCUDA")
  {
    cublasErrorCheck(cublasCreate(&h_cublas_), "cublasCreate failed!");
  }

  DiracMatrixComputeCUDA(const DiracMatrixComputeCUDA& other) : Resource(other)
  {
    cublasErrorCheck(cublasCreate(&h_cublas_), "cublasCreate failed!");
  }

  ~DiracMatrixComputeCUDA() { cublasErrorCheck(cublasDestroy(h_cublas_), "cublasDestroy failed!"); }

  std::unique_ptr<Resource> makeClone() const override { return std::make_unique<DiracMatrixComputeCUDA>(*this); }

  /** Given a_mat, returns the inverted matrix and the log determinant of a_mat.
   * \param[in] a_mat a matrix input
   * \param[out] inv_a_mat inverted matrix
   * \param[out] log_values the log determinant is returned in log_values[0]
   *
   * I consider this single-matrix call to be semi-deprecated, so the log determinant values
   * vector is used to match the primary batched interface to the accelerated routines.
   * There is no optimization (yet) for TMAT being the same type as VALUE_FP.
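   *
   * Example sketch (illustrative; `engine`, `queue`, and the dual matrices `A` and `Ainv` are assumed
   * to exist in the calling scope, and the class's own alias names are used for brevity):
   * \code{.cpp}
   *   DualVector<LogValue> log_values(1);
   *   engine.invert_transpose(queue, A, Ainv, log_values);
   *   // Synchronous: log_values[0] now holds the log determinant of A.
   * \endcode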
   */
  template<typename TMAT>
  inline void invert_transpose(compute::Queue<PlatformKind::CUDA>& queue,
                               DualMatrix<TMAT>& a_mat,
                               DualMatrix<TMAT>& inv_a_mat,
                               DualVector<LogValue>& log_values)
  {
    cudaStream_t h_stream = queue.getNative();
    cublasErrorCheck(cublasSetStream(h_cublas_, h_stream), "cublasSetStream failed!");
    const int n   = a_mat.rows();
    const int lda = a_mat.cols();
    psiM_fp_.resize(n * lda);
    invM_fp_.resize(n * lda);
    std::fill(log_values.begin(), log_values.end(), LogValue{0.0, 0.0});
    // making sure we know the log_values are zeroed on the device.
    cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
    simd::transpose(a_mat.data(), n, lda, psiM_fp_.data(), n, lda);
    cudaErrorCheck(cudaMemcpyAsync(psiM_fp_.device_data(), psiM_fp_.data(), psiM_fp_.size() * sizeof(VALUE_FP),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM_fp to device");
    mw_computeInvertAndLog_stride(queue, psiM_fp_, invM_fp_, n, lda, log_values);
    DualMatrix<VALUE_FP> data_ref_matrix;

    data_ref_matrix.attachReference(invM_fp_.data(), n, n);

    // We can't use operator= here because lda and ldb can differ, so we use this assignment,
    // which covers the smaller of the two matrices' dimensions.
    inv_a_mat.assignUpperLeft(data_ref_matrix);
    cudaErrorCheck(cudaMemcpyAsync(inv_a_mat.device_data(), inv_a_mat.data(), inv_a_mat.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync of inv_a_mat to device failed!");
  }

  /** Mixed precision specialization
   * When TMAT is not full precision we still need to do the inversion and log determinant
   * at full precision. This is not yet optimized to transpose on the GPU.
   *
   * List of operations:
   * 1. matrix-by-matrix. Transpose a_mat into psiM_fp_ on the host.
   * 2. batched. Call mw_computeInvertAndLog_stride: H2D, invert, D2H.
   * 3. matrix-by-matrix. Copy invM_fp_ to inv_a_mat on the host. Transfer inv_a_mat to the device.
   *
   * Pros and cons:
   * 1. the transfer is batched, but the transfer size doubles due to the precision promotion.
   * 3. \todo Copying invM_fp_ to inv_a_mat on the device is desired. The transfer of inv_a_mat to the host should be handled by the upper-level code.
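   *
   * Dispatch sketch (illustrative): with VALUE_FP = double, calling mw_invertTranspose with
   * TMAT = float matrices selects this overload through the enable_if, while TMAT = double
   * matrices select the full-precision overload below.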
   */
  template<typename TMAT>
  inline std::enable_if_t<!std::is_same<VALUE_FP, TMAT>::value> mw_invertTranspose(
      compute::Queue<PlatformKind::CUDA>& queue,
      const RefVector<const DualMatrix<TMAT>>& a_mats,
      const RefVector<DualMatrix<TMAT>>& inv_a_mats,
      DualVector<LogValue>& log_values)
  {
    cudaStream_t h_stream = queue.getNative();
    cublasErrorCheck(cublasSetStream(h_cublas_, h_stream), "cublasSetStream failed!");
    assert(log_values.size() == a_mats.size());
    const int nw  = a_mats.size();
    const int n   = a_mats[0].get().rows();
    const int lda = a_mats[0].get().cols();
    size_t nsqr   = n * n;
    psiM_fp_.resize(n * lda * nw);
    invM_fp_.resize(n * lda * nw);
    std::fill(log_values.begin(), log_values.end(), LogValue{0.0, 0.0});
    // making sure we know the log_values are zeroed on the device.
    cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
    for (int iw = 0; iw < nw; ++iw)
      simd::transpose(a_mats[iw].get().data(), n, a_mats[iw].get().cols(), psiM_fp_.data() + nsqr * iw, n, lda);
    mw_computeInvertAndLog_stride(queue, psiM_fp_, invM_fp_, n, lda, log_values);
    for (int iw = 0; iw < a_mats.size(); ++iw)
    {
      DualMatrix<VALUE_FP> data_ref_matrix;
      data_ref_matrix.attachReference(invM_fp_.data() + nsqr * iw, n, lda);
      // We can't use operator= here because lda and ldb can differ, so we use this assignment,
      // which covers the smaller of the two matrices' dimensions.
      inv_a_mats[iw].get().assignUpperLeft(data_ref_matrix);
      cudaErrorCheck(cudaMemcpyAsync(inv_a_mats[iw].get().device_data(), inv_a_mats[iw].get().data(),
                                     inv_a_mats[iw].get().size() * sizeof(TMAT), cudaMemcpyHostToDevice, h_stream),
                     "cudaMemcpyAsync of inv_a_mat to device failed!");
    }
  }

  /** Batched inversion and calculation of log determinants.
   * When TMAT is full precision we can use a_mat and inv_a_mat directly.
   * A side effect of this call is that the device copy of a_mats ends up holding
   * the LU factorization matrix.
   */
  template<typename TMAT>
  inline std::enable_if_t<std::is_same<VALUE_FP, TMAT>::value> mw_invertTranspose(
      compute::Queue<PlatformKind::CUDA>& queue,
      const RefVector<const DualMatrix<TMAT>>& a_mats,
      const RefVector<DualMatrix<TMAT>>& inv_a_mats,
      DualVector<LogValue>& log_values)
  {
    cudaStream_t h_stream = queue.getNative();
    cublasErrorCheck(cublasSetStream(h_cublas_, h_stream), "cublasSetStream failed!");
    assert(log_values.size() == a_mats.size());
    const int n = a_mats[0].get().rows();
    mw_computeInvertAndLog(queue, a_mats, inv_a_mats, n, log_values);
  }
};

} // namespace qmcplusplus

#endif //QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H