QMCPACK
cuSolverInverter.hpp
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2019 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_CUSOLVERINVERTOR_H
#define QMCPLUSPLUS_CUSOLVERINVERTOR_H

#include "OhmmsPETE/OhmmsVector.h"
#include "OhmmsPETE/OhmmsMatrix.h"
#include "CUDA/CUDAruntime.hpp"
#include "CUDA/CUDAallocator.hpp"
#include "CUDA/cusolver.hpp"
#include "CPU/math.hpp"

namespace qmcplusplus
{
/** implements matrix inversion via cuSolverDN
 * @tparam T_FP high precision for matrix inversion, T_FP >= T
 */
template<typename T_FP>
class cuSolverInverter
{
  /// scratch memory for cusolverDN
  Matrix<T_FP, CUDAAllocator<T_FP>> Mat1_gpu;
  /// scratch memory for cusolverDN
  Matrix<T_FP, CUDAAllocator<T_FP>> Mat2_gpu;
  /// pivot array + info; element 0 receives cuSolver's devInfo, elements 1..norb hold the LU pivots
  Vector<int, CUDAHostAllocator<int>> ipiv;
  Vector<int, CUDAAllocator<int>> ipiv_gpu;
  /// diagonal terms of LU matrix
  Vector<T_FP, CUDAHostAllocator<T_FP>> LU_diag;
  Vector<T_FP, CUDAAllocator<T_FP>> LU_diag_gpu;
  /// workspace
  Vector<T_FP, CUDAAllocator<T_FP>> work_gpu;

  // CUDA specific variables
  cusolverDnHandle_t h_cusolver_;
  cudaStream_t hstream_;

  /** resize the internal storage
   * @param norb number of electrons/orbitals
   */
  inline void resize(int norb)
  {
    if (Mat1_gpu.rows() != norb)
    {
      Mat1_gpu.resize(norb, norb);
      // prepare cusolver auxiliary arrays
      ipiv.resize(norb + 1);
      ipiv_gpu.resize(norb + 1);
      LU_diag.resize(norb);
      LU_diag_gpu.resize(norb);
      int lwork;
      cusolverErrorCheck(cusolver::getrf_bufferSize(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, &lwork),
                         "cusolver::getrf_bufferSize failed!");
      work_gpu.resize(lwork);
    }
  }

public:
  /// default constructor
  cuSolverInverter()
  {
    cudaErrorCheck(cudaStreamCreate(&hstream_), "cudaStreamCreate failed!");
    cusolverErrorCheck(cusolverDnCreate(&h_cusolver_), "cusolverCreate failed!");
    cusolverErrorCheck(cusolverDnSetStream(h_cusolver_, hstream_), "cusolverSetStream failed!");
  }

  ~cuSolverInverter()
  {
    cusolverErrorCheck(cusolverDnDestroy(h_cusolver_), "cusolverDestroy failed!");
    cudaErrorCheck(cudaStreamDestroy(hstream_), "cudaStreamDestroy failed!");
  }

  /** compute the inverse of the transpose of matrix A and its determinant value in log
   * when T_FP and TMAT are the same
   * @tparam TREAL real type
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                     Matrix<TMAT>& Ainv,
                                                                     Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                     std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    resize(norb);
    // upload the input matrix and LU-factorize it in place on the device
    cudaErrorCheck(cudaMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, hstream_),
                   "cudaMemcpyAsync failed!");
    cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                       ipiv_gpu.data() + 1, ipiv_gpu.data()),
                       "cusolver::getrf failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
    cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // check LU success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
    // solve A^T X = I with the LU factors; the result inv(A^T) overwrites the identity in Ainv_gpu
    make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                       Ainv_gpu.data(), norb, ipiv_gpu.data()),
                       "cusolver::getrs failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // log|det A| accumulated from the LU diagonal; the pivot parity fixes the sign
    computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
    cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    // check solve success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
  }

  /** compute the inverse of the transpose of matrix A and its determinant value in log
   * when T_FP and TMAT are not the same
   * @tparam TREAL real type
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                      Matrix<TMAT>& Ainv,
                                                                      Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                      std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    resize(norb);
    Mat2_gpu.resize(norb, norb);
    // upload the TMAT-precision input, then promote it to T_FP in Mat1_gpu before factorization
    cudaErrorCheck(cudaMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, hstream_),
                   "cudaMemcpyAsync failed!");
    copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                       ipiv_gpu.data() + 1, ipiv_gpu.data()),
                       "cusolver::getrf failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
    cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // check LU success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      throw std::runtime_error(err.str());
    }
    // solve in T_FP precision, then demote the result into Ainv_gpu (TMAT precision)
    make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                       Mat2_gpu.data(), norb, ipiv_gpu.data()),
                       "cusolver::getrs failed!");
    copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
    cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    // check solve success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      throw std::runtime_error(err.str());
    }

    std::ostringstream nan_msg;
    for (int i = 0; i < norb; i++)
      if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
        nan_msg << "  Ainv[" << i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
    if (const std::string str = nan_msg.str(); !str.empty())
      throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
  }
};
} // namespace qmcplusplus

#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H
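
Note on log_value: invert_transpose obtains the log determinant from computeLogDet (declared in DiracMatrix.h, not in this header), which works from the diagonal of the LU factor and the pivot indices: |det A| is the product of the LU diagonal entries, and each row interchange recorded by a pivot flips the sign. The snippet below is only a minimal illustrative sketch of that accumulation under the same 1-based LAPACK pivot convention used above; the function name is hypothetical and not the QMCPACK implementation.

#include <complex>

// Sketch: accumulate log(det A) from the LU diagonal and the pivot array.
template<typename T, typename T_FP>
std::complex<T_FP> log_det_from_lu(const T* diag, int n, const int* pivot)
{
  std::complex<T_FP> logdet{};
  for (int i = 0; i < n; i++)
    // pivot[i] != i+1 records a row swap at step i, which negates the determinant
    logdet += std::log(std::complex<T_FP>((pivot[i] == i + 1) ? diag[i] : -diag[i]));
  return logdet;
}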
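For context, a minimal hypothetical call sequence is sketched below; the matrix names, the size, and the include path are assumptions for illustration, and only the same-precision overload is exercised. Note that the caller provides Ainv_gpu already sized to norb x norb, since invert_transpose writes the device-side identity into it before the getrs solve.

#include "QMCWaveFunctions/Fermion/cuSolverInverter.hpp" // path assumed
#include <complex>

using namespace qmcplusplus;

void example()
{
  constexpr int norb = 64;
  Matrix<double> A(norb, norb), Ainv(norb, norb);              // host matrices
  Matrix<double, CUDAAllocator<double>> Ainv_gpu(norb, norb);  // device-side result buffer
  // ... fill A with the matrix to invert ...
  std::complex<double> log_value;
  cuSolverInverter<double> inverter;
  inverter.invert_transpose(A, Ainv, Ainv_gpu, log_value);     // Ainv = inv(A^T), log_value = log det A
}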