implements matrix inversion via cuSolverDN More...

Collaboration diagram for cuSolverInverter< T_FP >:

Public Member Functions
	cuSolverInverter ()
	default constructor More...

	~cuSolverInverter ()

template<typename TMAT , typename TREAL , typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
std::enable_if_t< std::is_same< TMAT, T_FP >::value >	invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
	compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same More...

template<typename TMAT , typename TREAL , typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
std::enable_if_t<!std::is_same< TMAT, T_FP >::value >	invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
	compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same More...

Private Member Functions
void	resize (int norb)
	resize the internal storage More...

Private Attributes
Matrix< T_FP, CUDAAllocator< T_FP > >	Mat1_gpu
	scratch memory for cusolverDN More...

Matrix< T_FP, CUDAAllocator< T_FP > >	Mat2_gpu
	scratch memory for cusolverDN More...

Vector< int, CUDAHostAllocator< int > >	ipiv
	pivot array + info: element 0 holds the solver status (devInfo), elements 1..norb hold the pivot indices More...

Vector< int, CUDAAllocator< int > >	ipiv_gpu

Vector< T_FP, CUDAHostAllocator< T_FP > >	LU_diag
	diagonal terms of LU matrix More...

Vector< T_FP, CUDAAllocator< T_FP > >	LU_diag_gpu

Vector< T_FP, CUDAAllocator< T_FP > >	work_gpu
	workspace More...

cusolverDnHandle_t	h_cusolver_

cudaStream_t	hstream_

Detailed Description

template<typename T_FP>
class qmcplusplus::cuSolverInverter< T_FP >

implements matrix inversion via cuSolverDN

Template Parameters

T_FP	high-precision value type used for the matrix inversion; at least as precise as the matrix value type (T_FP >= TMAT)

Definition at line 29 of file cuSolverInverter.hpp.

Constructor & Destructor Documentation

◆ cuSolverInverter()

cuSolverInverter ( )

inline

default constructor

Definition at line 71 of file cuSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), cudaStreamCreate, cusolverErrorCheck, cuSolverInverter< T_FP >::h_cusolver_, and cuSolverInverter< T_FP >::hstream_.

   {
     // Create the CUDA stream stored in hstream_; the error-check wrapper reports a failure with the given message.
     cudaErrorCheck(cudaStreamCreate(&hstream_), "cudaStreamCreate failed!");
     // Create the cuSolverDN handle stored in h_cusolver_.
     cusolverErrorCheck(cusolverDnCreate(&h_cusolver_), "cusolverCreate failed!");
     // Attach the handle to hstream_ so that solver calls made through h_cusolver_ are issued on that stream.
     // Both the stream and the handle must already exist at this point.
     cusolverErrorCheck(cusolverDnSetStream(h_cusolver_, hstream_), "cusolverSetStream failed!");
   }

◆ ~cuSolverInverter()

~cuSolverInverter ( )

inline

Definition at line 78 of file cuSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), cudaStreamDestroy, cusolverErrorCheck, cuSolverInverter< T_FP >::h_cusolver_, and cuSolverInverter< T_FP >::hstream_.

   {
     // Tear down in the reverse order of construction: first the cuSolverDN handle, which was
     // attached to hstream_ by the constructor, then the stream itself.
     cusolverErrorCheck(cusolverDnDestroy(h_cusolver_), "cusolverDestroy failed!");
     cudaErrorCheck(cudaStreamDestroy(hstream_), "cudaStreamDestroy failed!");
   }

Member Function Documentation

◆ invert_transpose() [1/2]

std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose	(	const Matrix< TMAT > &	logdetT,
		Matrix< TMAT > &	Ainv,
		Matrix< TMAT, CUDAAllocator< TMAT >> &	Ainv_gpu,
		std::complex< TREAL > &	log_value
	)

inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same

Template Parameters

TREAL real type

Definition at line 89 of file cuSolverInverter.hpp.

Referenced by qmcplusplus::TEMPLATE_TEST_CASE(), and qmcplusplus::TEST_CASE().

   {
     // Problem size is taken from the row count of the input; the code below uses norb for both
     // dimensions, i.e. it treats logdetT as a square norb x norb matrix.
     const int norb = logdetT.rows();
     // Make sure the scratch buffers (Mat1_gpu, ipiv, ipiv_gpu, LU_diag, LU_diag_gpu, work_gpu) match norb.
     resize(norb);
     // Upload the input matrix into the device scratch matrix Mat1_gpu. All device work in this
     // function is enqueued on hstream_.
     cudaErrorCheck(cudaMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                    cudaMemcpyHostToDevice, hstream_),
                    "cudaMemcpyAsync failed!");
     // LU factorization of Mat1_gpu. ipiv_gpu is split in two: ipiv_gpu.data() + 1 is handed over as the
     // pivot array and ipiv_gpu.data() (element 0) as the status word (reported below as devInfo).
     // NOTE(review): argument roles are inferred from how ipiv[0] / ipiv.data() + 1 are used below;
     // the cusolver::getrf wrapper itself is not visible here.
     cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                        ipiv_gpu.data() + 1, ipiv_gpu.data()),
                        "cusolver::getrf failed!");
     // Bring status word and pivots (the whole ipiv_gpu array) back to the host copy ipiv.
     cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
                                    hstream_),
                    "cudaMemcpyAsync failed!");
     // Gather the diagonal of the factorized matrix into LU_diag_gpu and copy it to the host (LU_diag);
     // it is consumed by computeLogDet further down.
     extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
     cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                    cudaMemcpyDeviceToHost, hstream_),
                    "cudaMemcpyAsync failed!");
     // check LU success
     // The synchronize is required before ipiv[0] (and later LU_diag / the pivots) may be read on the host.
     cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
     if (ipiv[0] != 0)
     {
       // Non-zero status: print the message to std::cerr and throw std::runtime_error with the same text.
       std::ostringstream err;
       err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
     // Fill Ainv_gpu with the identity and solve against it with the LU factors using CUBLAS_OP_T,
     // which is how the inverse of the transpose is obtained. The status word is again element 0 of ipiv_gpu.
     // NOTE(review): assumes getrs overwrites the right-hand side (Ainv_gpu) with the solution — confirm in the wrapper.
     make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
     cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                        Ainv_gpu.data(), norb, ipiv_gpu.data()),
                        "cusolver::getrs failed!");
     // Only the status word (one int) is copied back this time; the host pivots in ipiv[1..] are left untouched.
     cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                    "cudaMemcpyAsync failed!");
     // Host-side log-determinant from the LU diagonal and the pivots. Both inputs were made valid by the
     // synchronize after getrf, so this call is issued before the next synchronize, while the work
     // enqueued above may still be pending on the stream.
     computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
     // Download the result from Ainv_gpu into the host matrix Ainv.
     cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
                                    hstream_),
                    "cudaMemcpyAsync failed!");
     // check solve success
     // After this synchronize both ipiv[0] (getrs status) and Ainv are valid on the host.
     cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
     if (ipiv[0] != 0)
     {
       // Same reporting as for getrf: echo to std::cerr, then throw std::runtime_error.
       std::ostringstream err;
       err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
   }

◆ invert_transpose() [2/2]

std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose	(	const Matrix< TMAT > &	logdetT,
		Matrix< TMAT > &	Ainv,
		Matrix< TMAT, CUDAAllocator< TMAT >> &	Ainv_gpu,
		std::complex< TREAL > &	log_value
	)

inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same

Template Parameters

TREAL real type

Definition at line 144 of file cuSolverInverter.hpp.

   {
     // Problem size is taken from the row count of the input; the code below uses norb for both
     // dimensions, i.e. it treats logdetT as a square norb x norb matrix.
     const int norb = logdetT.rows();
     // Size the shared scratch buffers, plus Mat2_gpu which is only needed in this mixed-type path.
     resize(norb);
     Mat2_gpu.resize(norb, norb);
     // Upload the TMAT input into Mat2_gpu (a T_FP buffer used here as raw staging space), then
     // copy it element-wise from TMAT to T_FP into Mat1_gpu. All device work is enqueued on hstream_.
     // NOTE(review): this relies on sizeof(TMAT) <= sizeof(T_FP) so the staged data fits — confirm.
     cudaErrorCheck(cudaMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                    cudaMemcpyHostToDevice, hstream_),
                    "cudaMemcpyAsync failed!");
     copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
     // LU factorization of Mat1_gpu. ipiv_gpu is split in two: ipiv_gpu.data() + 1 is handed over as the
     // pivot array and ipiv_gpu.data() (element 0) as the status word (reported below as devInfo).
     // NOTE(review): argument roles are inferred from how ipiv[0] / ipiv.data() + 1 are used below;
     // the cusolver::getrf wrapper itself is not visible here.
     cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                        ipiv_gpu.data() + 1, ipiv_gpu.data()),
                        "cusolver::getrf failed!");
     // Bring status word and pivots (the whole ipiv_gpu array) back to the host copy ipiv.
     cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
                                    hstream_),
                    "cudaMemcpyAsync failed!");
     // Gather the diagonal of the factorized matrix into LU_diag_gpu and copy it to the host (LU_diag);
     // it is consumed by computeLogDet further down.
     extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
     cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                    cudaMemcpyDeviceToHost, hstream_),
                    "cudaMemcpyAsync failed!");
     // check LU success
     // The synchronize is required before ipiv[0] (and later LU_diag / the pivots) may be read on the host.
     cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
     if (ipiv[0] != 0)
     {
       // Non-zero status: throw std::runtime_error carrying the devInfo value.
       // NOTE(review): the same-type overload also echoes this message to std::cerr; this one does not — confirm intended.
       std::ostringstream err;
       err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
       throw std::runtime_error(err.str());
     }
     // Mat2_gpu is reused: it is filled with the identity and solved against with the LU factors using
     // CUBLAS_OP_T, which is how the inverse of the transpose is obtained (in T_FP precision).
     // NOTE(review): assumes getrs overwrites the right-hand side (Mat2_gpu) with the solution — confirm in the wrapper.
     make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
     cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                        Mat2_gpu.data(), norb, ipiv_gpu.data()),
                        "cusolver::getrs failed!");
     // Copy the T_FP result into the caller's TMAT device matrix Ainv_gpu.
     copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
     // Only the status word (one int) is copied back this time; the host pivots in ipiv[1..] are left untouched.
     cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                    "cudaMemcpyAsync failed!");
     // Host-side log-determinant from the LU diagonal and the pivots. Both inputs were made valid by the
     // synchronize after getrf, so this call is issued before the next synchronize, while the work
     // enqueued above may still be pending on the stream.
     computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
     // Download the result from Ainv_gpu into the host matrix Ainv.
     cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
                                    hstream_),
                    "cudaMemcpyAsync failed!");
     // check solve success
     // After this synchronize both ipiv[0] (getrs status) and Ainv are valid on the host.
     cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
     if (ipiv[0] != 0)
     {
       // Non-zero status: throw std::runtime_error carrying the devInfo value (no std::cerr echo here).
       std::ostringstream err;
       err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
       throw std::runtime_error(err.str());
     }
 
     // Sanity check of the host result: collect every diagonal element of Ainv whose std::norm is NaN
     // and, if any were found, throw a single std::runtime_error listing all of them.
     std::ostringstream nan_msg;
     for(int i = 0; i < norb; i++)
       if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
         nan_msg << "  Ainv["<< i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
     if (const std::string str = nan_msg.str(); !str.empty())
       throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
   }

◆ resize()

void resize ( int norb )

inlineprivate

resize the internal storage

Parameters

norb	number of electrons/orbitals
delay,maximum	delay 0<delay<=norb

Definition at line 52 of file cuSolverInverter.hpp.

References cusolverErrorCheck, qmcplusplus::cusolver::getrf_bufferSize(), cuSolverInverter< T_FP >::h_cusolver_, cuSolverInverter< T_FP >::ipiv, cuSolverInverter< T_FP >::ipiv_gpu, cuSolverInverter< T_FP >::LU_diag, cuSolverInverter< T_FP >::LU_diag_gpu, cuSolverInverter< T_FP >::Mat1_gpu, and cuSolverInverter< T_FP >::work_gpu.

Referenced by cuSolverInverter< T_FP >::invert_transpose().

   {
     if (Mat1_gpu.rows() != norb)
     {
       Mat1_gpu.resize(norb, norb);
       // prepare cusolver auxiliary arrays
       ipiv.resize(norb + 1);
       ipiv_gpu.resize(norb + 1);
       LU_diag.resize(norb);
       LU_diag_gpu.resize(norb);
       int lwork;
       cusolverErrorCheck(cusolver::getrf_bufferSize(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, &lwork),
                          "cusolver::getrf_bufferSize failed!");
       work_gpu.resize(lwork);
     }
   }