de/df6/a01670_source.html

 //////////////////////////////////////////////////////////////////////////////////////
 // This file is distributed under the University of Illinois/NCSA Open Source License.
 // See LICENSE file in top directory for details.
 //
 // Copyright (c) 2022 QMCPACK developers.
 //
 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //                    Mark Dewing, mdewing@anl.gov, Argonne National Laboratory
 //
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////

 #ifndef QMCPLUSPLUS_ROCSOLVERINVERTER_H
 #define QMCPLUSPLUS_ROCSOLVERINVERTER_H

 #include <complex>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
 #include "OhmmsPETE/OhmmsVector.h"
 #include "OhmmsPETE/OhmmsMatrix.h"

 #if !defined(QMC_CUDA2HIP)
 #error rocSolverInverter.hpp expects QMC_CUDA2HIP to be defined
 #endif
 // This file assumes that QMC_CUDA2HIP is defined, which creates HIP versions of these functions (despite their being labeled with "CUDA").
 #include "CUDA/CUDAruntime.hpp"
 #include "CUDA/CUDAallocator.hpp"
 #include "ROCm/rocsolver.hpp"
 #include "QMCWaveFunctions/detail/CUDA/delayed_update_helper.h"
 #include "CPU/math.hpp"

 namespace qmcplusplus
 {
 /** implements matrix inversion via rocSolver
  *
  * Each object creates its own HIP stream and rocBLAS handle in the constructor and releases
  * them in the destructor. All device work issued by invert_transpose is queued on that stream.
  *
  * NOTE(review): the class owns a stream and a handle but does not disable copying; a copy would
  * release the same stream/handle a second time on destruction. Confirm that no caller copies it.
  *
  * @tparam T_FP high precision for matrix inversion, T_FP >= T
  */
 template<typename T_FP>
 class rocSolverInverter
 {
   /// device scratch matrix, LU factorized in place by rocsolver::getrf
   Matrix<T_FP, CUDAAllocator<T_FP>> Mat1_gpu;
   /// device scratch matrix, only used by the mixed precision invert_transpose
   Matrix<T_FP, CUDAAllocator<T_FP>> Mat2_gpu;
   /// pivot array + info. Element 0 receives the solver info word, elements 1..norb the pivots.
   Vector<int, CUDAHostAllocator<int>> ipiv;
   Vector<int, CUDAAllocator<int>> ipiv_gpu;
   /// diagonal terms of LU matrix
   Vector<T_FP, CUDAHostAllocator<T_FP>> LU_diag;
   Vector<T_FP, CUDAAllocator<T_FP>> LU_diag_gpu;
   /// workspace. Not referenced by any code in this class.
   Vector<T_FP, CUDAAllocator<T_FP>> work_gpu;

   // HIP/rocBLAS specific variables
   rocblas_handle h_rocsolver_;
   hipStream_t hstream_;

   /** resize the internal storage
    *
    * Storage is only touched when the row count of Mat1_gpu differs from norb.
    * Mat2_gpu is not resized here; the mixed precision invert_transpose resizes it itself.
    * @param norb number of electrons/orbitals
    */
   inline void resize(int norb)
   {
     if (Mat1_gpu.rows() != norb)
     {
       Mat1_gpu.resize(norb, norb);
       // prepare rocSOLVER auxiliary arrays, one extra leading element stores the info word
       ipiv.resize(norb + 1);
       ipiv_gpu.resize(norb + 1);
       LU_diag.resize(norb);
       LU_diag_gpu.resize(norb);

       // Memory for temporary storage for solver calls.
       // The rocSOLVER library handles this memory itself.
       //  If we need more control, there are API's to get the size and set the buffer memory
 #if 0
       // NOTE(review): disabled code. The wrapper names and argument lists below need to be
       // verified against rocsolver.hpp before this block is enabled.
       size_t memory_size;
       rocblas_start_device_memory_size_query(h_rocsolver_);
       rocsolverErrorCheck(rocsolver::dgetrf(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr),
                           "rocsolver::dgetrf size query failed!");
       rocsolverErrorCheck(rocsolver::dgetri(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr),
                           "rocsolver::dgetri size query failed!");
       rocblas_stop_device_memory_size_query(h_rocsolver_, &memory_size);
 #endif
     }
   }

   /** report a failed solver call based on the info word held in ipiv[0]
    *
    * The caller must have copied the info word to the host and synchronized the stream beforehand.
    * When ipiv[0] is non-zero, the message is printed to std::cerr and thrown as std::runtime_error.
    * @param routine name of the solver routine used as the prefix of the message
    */
   inline void checkSolverInfo(const char* routine)
   {
     if (ipiv[0] != 0)
     {
       std::ostringstream err;
       err << routine << " calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
   }

 public:
   /// default constructor. Creates the stream and the rocBLAS handle and attaches the handle to the stream.
   rocSolverInverter()
   {
     cudaErrorCheck(hipStreamCreate(&hstream_), "hipStreamCreate failed!");
     rocsolverErrorCheck(rocblas_create_handle(&h_rocsolver_), "rocblas_create_handle failed!");
     rocsolverErrorCheck(rocblas_set_stream(h_rocsolver_, hstream_), "rocblas_set_stream failed!");
   }

   /// destructor. Releases the handle first and the stream it was attached to afterwards.
   ~rocSolverInverter()
   {
     // NOTE(review): if the error check macros throw, a failure here would escape a destructor — confirm intended.
     rocsolverErrorCheck(rocblas_destroy_handle(h_rocsolver_), "rocblas_destroy_handle failed!");
     cudaErrorCheck(hipStreamDestroy(hstream_), "hipStreamDestroy failed!");
   }

   /** compute the inverse of the transpose of matrix A and its determinant value in log
    * when T_FP and TMAT are the same
    * @tparam TMAT matrix element type, identical to T_FP for this overload
    * @tparam TREAL real type
    * @param logdetT input matrix on the host. Its row count defines norb.
    * @param Ainv output matrix on the host, filled from Ainv_gpu
    * @param Ainv_gpu output matrix on the device, holds the result of the getrs solve
    * @param log_value output, set by computeLogDet from the LU diagonal and the pivots
    * @throws std::runtime_error when the solver info word is non-zero
    */
   template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
   std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                      Matrix<TMAT>& Ainv,
                                                                      Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                      std::complex<TREAL>& log_value)
   {
     const int norb = logdetT.rows();
     resize(norb);
     // upload the input and factorize it in place
     cudaErrorCheck(hipMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
                                   hstream_),
                    "hipMemcpyAsync for logdetT to Mat1_gpu failed!");
     // pivots are written starting at element 1, the info word goes to element 0
     rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                          ipiv_gpu.data()),
                         "rocsolver::getrf failed!");
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync for ipiv failed!");
     extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
     cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync for LU_diag failed!");
     // check LU success
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
     checkSolverInfo("rocsolver::getrf");
     // solve against the identity matrix to obtain the inverse
     make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
     rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
                                          ipiv_gpu.data() + 1, Ainv_gpu.data(), norb),
                         "rocsolver::getrs failed!");
     // only the info word (element 0) is copied back here
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync for ipiv failed!");
     // host side work on data already transferred before the previous synchronization
     computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
     cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync for Ainv failed!");
     // check solve success
     // NOTE(review): getrs is given no info pointer above, so this appears to re-read the status written by getrf.
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
     checkSolverInfo("rocsolver::getrs");
   }

   /** compute the inverse of the transpose of matrix A and its determinant value in log
    * when T_FP and TMAT are not the same
    *
    * The input is staged in Mat2_gpu as TMAT, converted into Mat1_gpu as T_FP for the factorization
    * and the solve, and the result is converted back to TMAT into Ainv_gpu.
    * After the transfer to the host, the diagonal of Ainv is checked for NaN.
    * @tparam TMAT matrix element type, different from T_FP for this overload
    * @tparam TREAL real type
    * @param logdetT input matrix on the host. Its row count defines norb.
    * @param Ainv output matrix on the host, filled from Ainv_gpu
    * @param Ainv_gpu output matrix on the device
    * @param log_value output, set by computeLogDet from the LU diagonal and the pivots
    * @throws std::runtime_error when the solver info word is non-zero or a diagonal element of Ainv is NaN
    */
   template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
   std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                       Matrix<TMAT>& Ainv,
                                                                       Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                       std::complex<TREAL>& log_value)
   {
     const int norb = logdetT.rows();
     resize(norb);
     Mat2_gpu.resize(norb, norb);
     // upload the input as TMAT into the storage of Mat2_gpu, then convert it into Mat1_gpu
     cudaErrorCheck(hipMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
                                   hstream_),
                    "hipMemcpyAsync failed!");
     copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
     // pivots are written starting at element 1, the info word goes to element 0
     rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                          ipiv_gpu.data()),
                         "rocsolver::getrf failed!");
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync failed!");
     extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
     cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync failed!");
     // check LU success
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
     checkSolverInfo("rocsolver::getrf");
     // solve against the identity matrix in T_FP, then convert the result into Ainv_gpu
     make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
     rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
                                          ipiv_gpu.data() + 1, Mat2_gpu.data(), norb),
                         "rocsolver::getrs failed!");
     copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
     // only the info word (element 0) is copied back here
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync failed!");
     // host side work on data already transferred before the previous synchronization
     computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
     cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync failed!");
     // check solve success
     // NOTE(review): getrs is given no info pointer above, so this appears to re-read the status written by getrf.
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
     checkSolverInfo("rocsolver::getrs");

     // collect every NaN found on the diagonal of the inverse before reporting
     std::ostringstream nan_msg;
     for(int i = 0; i < norb; i++)
       if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
         nan_msg << "  Ainv["<< i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
     if (const std::string str = nan_msg.str(); !str.empty())
       throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
   }
 };
 } // namespace qmcplusplus

 #endif // QMCPLUSPLUS_ROCSOLVERINVERTER_H
qmcplusplus::rocSolverInverter::work_gpu
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
workspace
Definition: rocSolverInverter.hpp:48

qmcplusplus::rocSolverInverter::rocSolverInverter
rocSolverInverter()
default constructor
Definition: rocSolverInverter.hpp:84

qmcplusplus::rocsolver::getrs
rocblas_status getrs(rocblas_handle &handle, const rocblas_operation &transa, int m, int n, double *A, int lda, int *ipiv, double *B, int ldb)
Definition: rocsolver.hpp:117

qmcplusplus::rocsolver::getrf
rocblas_status getrf(rocblas_handle &handle, int m, int n, double *A, int lda, int *ipiv, int *info)
Definition: rocsolver.hpp:101

qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43

qmcplusplus::rocSolverInverter::ipiv
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
Definition: rocSolverInverter.hpp:42

make_identity_matrix_cuda
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device

CUDAruntime.hpp
handle CUDA/HIP runtime selection.

extract_matrix_diagonal_cuda
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal

qmcplusplus::rocSolverInverter::invert_transpose
std::enable_if_t< std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: rocSolverInverter.hpp:102

qmcplusplus::Vector
Definition: OhmmsVector.h:33

qmcplusplus::rocSolverInverter::~rocSolverInverter
~rocSolverInverter()
Definition: rocSolverInverter.hpp:91

copy_matrix_cuda
void copy_matrix_cuda(const int nrows, const int ncols, const double *mat_in, const int lda, float *mat_out, const int ldb, cudaStream_t hstream)
copy matrix with precision difference

CUDAallocator.hpp
this file provides three C++ memory allocators using CUDA specific memory allocation functions...

qmcplusplus::rocSolverInverter
implements matrix inversion via rocSolver
Definition: rocSolverInverter.hpp:35

rocsolver.hpp

qmcplusplus::Matrix::size
size_type size() const
Definition: OhmmsMatrix.h:76

norm
double norm(const zVec &c)
Definition: VectorOps.h:118

qmcplusplus::rocSolverInverter::ipiv_gpu
Vector< int, CUDAAllocator< int > > ipiv_gpu
Definition: rocSolverInverter.hpp:43

dgetrf
void dgetrf(const int &n, const int &m, double *a, const int &n0, int *piv, int &st)

qmcplusplus::cudaErrorCheck
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")

qmcplusplus::rocSolverInverter::LU_diag_gpu
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
Definition: rocSolverInverter.hpp:46

qmcplusplus::rocSolverInverter::hstream_
hipStream_t hstream_
Definition: rocSolverInverter.hpp:52

delayed_update_helper.h

qmcplusplus::computeLogDet
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100

qmcplusplus::rocSolverInverter::invert_transpose
std::enable_if_t<!std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: rocSolverInverter.hpp:156

qmcplusplus::Matrix::rows
size_type rows() const
Definition: OhmmsMatrix.h:77

OhmmsVector.h
Declaration of Vector<T,Alloc>. Manages memory through Alloc directly and allows referencing an existing ...

qmcplusplus::Matrix
Definition: OhmmsMatrix.h:27

qmcplusplus::rocSolverInverter::Mat2_gpu
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
scratch memory for cusolverDN
Definition: rocSolverInverter.hpp:40

qmcplusplus::Matrix::data
pointer data()
Definition: OhmmsMatrix.h:182

qmcplusplus::rocSolverInverter::h_rocsolver_
rocblas_handle h_rocsolver_
Definition: rocSolverInverter.hpp:51

rocsolverErrorCheck
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25

qmcplusplus::rocSolverInverter::LU_diag
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
Definition: rocSolverInverter.hpp:45

qmcplusplus::CUDAAllocator
allocator for CUDA device memory
Definition: CUDAallocator.hpp:95

dgetri
void dgetri(const int &n, double *a, const int &n0, int const *piv, double *work, const int &, int &st)

OhmmsMatrix.h

qmcplusplus::rocSolverInverter::Mat1_gpu
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
Definition: rocSolverInverter.hpp:38

math.hpp

qmcplusplus::isnan
bool isnan(float a)
return true if the value is NaN.
Definition: math.cpp:18

qmcplusplus::rocSolverInverter::resize
void resize(int norb)
resize the internal storage
Definition: rocSolverInverter.hpp:58