implements matrix inversion via rocSolver More...

Collaboration diagram for rocSolverInverter< T_FP >:

Public Member Functions
	rocSolverInverter ()
	default constructor More...

	~rocSolverInverter ()

template<typename TMAT , typename TREAL , typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
std::enable_if_t< std::is_same< TMAT, T_FP >::value >	invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
	compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same More...

template<typename TMAT , typename TREAL , typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
std::enable_if_t<!std::is_same< TMAT, T_FP >::value >	invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
	compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same More...

Private Member Functions
void	resize (int norb)
	resize the internal storage More...

Private Attributes
Matrix< T_FP, CUDAAllocator< T_FP > >	Mat1_gpu
	scratch memory for rocSOLVER More...

Matrix< T_FP, CUDAAllocator< T_FP > >	Mat2_gpu
	scratch memory for rocSOLVER More...

Vector< int, CUDAHostAllocator< int > >	ipiv
	pivot array + info More...

Vector< int, CUDAAllocator< int > >	ipiv_gpu

Vector< T_FP, CUDAHostAllocator< T_FP > >	LU_diag
	diagonal terms of LU matrix More...

Vector< T_FP, CUDAAllocator< T_FP > >	LU_diag_gpu

Vector< T_FP, CUDAAllocator< T_FP > >	work_gpu
	workspace More...

rocblas_handle	h_rocsolver_

hipStream_t	hstream_

Detailed Description

template<typename T_FP>
class qmcplusplus::rocSolverInverter< T_FP >

implements matrix inversion via rocSolver

Template Parameters

T_FP	high-precision type used for the matrix inversion; its precision is at least that of the matrix value type (T_FP >= TMAT)

Definition at line 35 of file rocSolverInverter.hpp.

Constructor & Destructor Documentation

◆ rocSolverInverter()

rocSolverInverter ( )

inline

default constructor

Definition at line 84 of file rocSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, and rocsolverErrorCheck.

   {
     // Set up the GPU resources used by this object: a HIP stream, and a rocBLAS handle
     // (passed to the rocsolver calls in this class) that is bound to that stream.
     // Each call's return status is checked by the corresponding *ErrorCheck helper.
     cudaErrorCheck(hipStreamCreate(&hstream_), "hipStreamCreate failed!");
     rocsolverErrorCheck(rocblas_create_handle(&h_rocsolver_), "rocblas_create_handle failed!");
     // attach the handle to hstream_ so that work issued through the handle uses the same stream
     // as the hipMemcpyAsync calls and helper kernels in invert_transpose
     rocsolverErrorCheck(rocblas_set_stream(h_rocsolver_, hstream_), "rocblas_set_stream failed!");
   }

◆ ~rocSolverInverter()

~rocSolverInverter ( )

inline

Definition at line 91 of file rocSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, and rocsolverErrorCheck.

   {
     // Release the resources created in the constructor, in reverse order of creation:
     // the handle (which was bound to hstream_) first, then the stream itself.
     rocsolverErrorCheck(rocblas_destroy_handle(h_rocsolver_), "rocblas_destroy_handle failed!");
     cudaErrorCheck(hipStreamDestroy(hstream_), "hipStreamDestroy failed!");
   }

Member Function Documentation

◆ invert_transpose() [1/2]

std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose	(	const Matrix< TMAT > &	logdetT,
		Matrix< TMAT > &	Ainv,
		Matrix< TMAT, CUDAAllocator< TMAT >> &	Ainv_gpu,
		std::complex< TREAL > &	log_value
	)

inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same

Template Parameters

TREAL real type

Definition at line 102 of file rocSolverInverter.hpp.

Referenced by DelayedUpdateCUDA< T, T_FP >::invert_transpose(), qmcplusplus::TEMPLATE_TEST_CASE(), and qmcplusplus::TEST_CASE().

   {
     // Same-precision path (TMAT == T_FP): upload logdetT, LU-factorize it with getrf, then call getrs with
     // rocblas_operation_transpose against an identity right-hand side held in Ainv_gpu. The result is left in
     // Ainv_gpu and copied back to Ainv; log_value is filled by computeLogDet from the LU diagonal and pivots.
     // Throws std::runtime_error when the status value read back in ipiv[0] is non-zero.
     // NOTE(review): Ainv_gpu and Ainv are not resized here — presumably the caller sizes them to norb x norb; confirm.
     const int norb = logdetT.rows();
     resize(norb);
     // upload the input matrix into the scratch matrix that is handed to getrf
     cudaErrorCheck(hipMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
                                   hstream_),
                    "hipMemcpyAsync for logdetT to Mat1_gpu failed!");
     // ipiv_gpu layout: slot 0 is passed as the last (status) argument, the pivots start at slot 1
     rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                          ipiv_gpu.data()),
                         "rocsolver::getrf failed!");
     // bring the whole ipiv buffer (status slot + pivots) back to the host
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync for ipiv failed!");
     // gather the diagonal of the factorized matrix on the device, then copy it to the host for computeLogDet
     extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
     cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync for LU_diag failed!");
     // check LU success
     // (the stream must be synchronized before ipiv and LU_diag are read on the host)
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
     if (ipiv[0] != 0)
     {
       std::ostringstream err;
       err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
     // the right-hand side starts as the identity, so the solve leaves the inverse in Ainv_gpu
     make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
     rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
                                          ipiv_gpu.data() + 1, Ainv_gpu.data(), norb),
                         "rocsolver::getrs failed!");
     // re-read only the status slot (a single int)
     // NOTE(review): the getrs call above is not passed ipiv_gpu.data(), so this looks like it re-reads the value
     // written by getrf — confirm whether getrs reports anything through this slot.
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync for ipiv failed!");
     // host-side work that runs while the stream is still busy: it reads LU_diag and ipiv[1..], both of which
     // were copied back and synchronized before the getrf check; the copy queued just above writes ipiv[0] only
     computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
     cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync for Ainv failed!");
     // wait for the solve and both copies before inspecting ipiv[0] and returning Ainv to the caller
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
     if (ipiv[0] != 0)
     {
       std::ostringstream err;
       err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
   }

◆ invert_transpose() [2/2]

std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose	(	const Matrix< TMAT > &	logdetT,
		Matrix< TMAT > &	Ainv,
		Matrix< TMAT, CUDAAllocator< TMAT >> &	Ainv_gpu,
		std::complex< TREAL > &	log_value
	)

inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same

Template Parameters

TREAL real type

Definition at line 156 of file rocSolverInverter.hpp.

References qmcplusplus::computeLogDet(), copy_matrix_cuda(), qmcplusplus::cudaErrorCheck(), Matrix< T, Alloc >::data(), extract_matrix_diagonal_cuda(), qmcplusplus::rocsolver::getrf(), qmcplusplus::rocsolver::getrs(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, rocSolverInverter< T_FP >::ipiv, rocSolverInverter< T_FP >::ipiv_gpu, qmcplusplus::isnan(), rocSolverInverter< T_FP >::LU_diag, rocSolverInverter< T_FP >::LU_diag_gpu, make_identity_matrix_cuda(), rocSolverInverter< T_FP >::Mat1_gpu, rocSolverInverter< T_FP >::Mat2_gpu, norm(), rocSolverInverter< T_FP >::resize(), rocsolverErrorCheck, Matrix< T, Alloc >::rows(), and Matrix< T, Alloc >::size().

   {
     // Mixed-precision path (TMAT != T_FP): the TMAT input is uploaded and converted into the T_FP scratch matrix
     // Mat1_gpu, factorized with getrf, and solved with getrs (rocblas_operation_transpose) against an identity
     // held in Mat2_gpu. The T_FP result is then copied into the TMAT matrix Ainv_gpu and back to the host Ainv.
     // log_value is filled by computeLogDet from the LU diagonal and pivots.
     // Throws std::runtime_error when the status value read back in ipiv[0] is non-zero, or when a diagonal
     // element of the returned Ainv is NaN.
     // NOTE(review): Ainv_gpu and Ainv are not resized here — presumably the caller sizes them to norb x norb; confirm.
     const int norb = logdetT.rows();
     resize(norb);
     // Mat2_gpu is not handled by resize(), so it is sized here
     Mat2_gpu.resize(norb, norb);
     // stage the raw TMAT data in Mat2_gpu's storage (byte count is based on sizeof(TMAT), not sizeof(T_FP))
     cudaErrorCheck(hipMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
                                   hstream_),
                    "hipMemcpyAsync failed!");
     // reinterpret the staged bytes as TMAT and copy them element-wise into the T_FP matrix Mat1_gpu
     copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
     // ipiv_gpu layout: slot 0 is passed as the last (status) argument, the pivots start at slot 1
     rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                          ipiv_gpu.data()),
                         "rocsolver::getrf failed!");
     // bring the whole ipiv buffer (status slot + pivots) back to the host
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync failed!");
     // gather the diagonal of the factorized matrix on the device, then copy it to the host for computeLogDet
     extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
     cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync failed!");
     // check LU success
     // (the stream must be synchronized before ipiv and LU_diag are read on the host)
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
     if (ipiv[0] != 0)
     {
       std::ostringstream err;
       err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
     // Mat2_gpu is reused as the right-hand side: it starts as the identity, so the solve leaves the inverse in it
     make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
     rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
                                          ipiv_gpu.data() + 1, Mat2_gpu.data(), norb),
                         "rocsolver::getrs failed!");
     // copy the T_FP result into the caller's TMAT device matrix
     copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
     // re-read only the status slot (a single int)
     // NOTE(review): the getrs call above is not passed ipiv_gpu.data(), so this looks like it re-reads the value
     // written by getrf — confirm whether getrs reports anything through this slot.
     cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
                    "hipMemcpyAsync failed!");
     // host-side work that runs while the stream is still busy: it reads LU_diag and ipiv[1..], both of which
     // were copied back and synchronized before the getrf check; the copy queued just above writes ipiv[0] only
     computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
     cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
                                   hstream_),
                    "hipMemcpyAsync failed!");
     // check solve success
     // (also guarantees that Ainv is complete on the host before the diagonal check below)
     cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
     if (ipiv[0] != 0)
     {
       std::ostringstream err;
       err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }
 
     // sanity check on the host copy: collect every NaN diagonal element of Ainv into one message and
     // throw once at the end so that all offending entries are reported together
     std::ostringstream nan_msg;
     for(int i = 0; i < norb; i++)
       if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
         nan_msg << "  Ainv["<< i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
     if (const std::string str = nan_msg.str(); !str.empty())
       throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
   }

◆ resize()

void resize ( int norb )

inline private

resize the internal storage

Parameters

norb	number of electrons/orbitals
delay,maximum	delay 0<delay<=norb

Definition at line 58 of file rocSolverInverter.hpp.

References dgetrf(), dgetri(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::ipiv, rocSolverInverter< T_FP >::ipiv_gpu, rocSolverInverter< T_FP >::LU_diag, rocSolverInverter< T_FP >::LU_diag_gpu, rocSolverInverter< T_FP >::Mat1_gpu, and rocsolverErrorCheck.

Referenced by rocSolverInverter< T_FP >::invert_transpose().

   {
     // Resize the internal buffers for an norb x norb problem. Nothing is done when Mat1_gpu already has
     // norb rows, so repeated calls with the same size are cheap. Mat2_gpu is not touched here.
     if (Mat1_gpu.rows() != norb)
     {
       Mat1_gpu.resize(norb, norb);
       // prepare rocSOLVER auxiliary arrays
       // ipiv/ipiv_gpu hold norb + 1 ints: invert_transpose uses slot 0 for the solver status and
       // slots 1..norb for the pivots
       ipiv.resize(norb + 1);
       ipiv_gpu.resize(norb + 1);
       // host and device buffers for the norb diagonal elements of the factorized matrix
       LU_diag.resize(norb);
       LU_diag_gpu.resize(norb);
 
       // Memory for temporary storage for solver calls.
       // The rocSOLVER library handles this memory itself.
       // If we need more control, there are APIs to get the size and set the buffer memory.
       // NOTE(review): the block below is disabled and would not compile as written — both rocsolverErrorCheck
       // calls are missing a closing parenthesis and the message argument used everywhere else in this class.
 #if 0
       size_t memory_size;
       rocblas_start_device_memory_size_query(h_rocsolver_);
       rocsolverErrorCheck(rocsolver::dgetrf(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr);
       rocsolverErrorCheck(rocsolver::dgetri(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr);
       rocblas_stop_device_memory_size_query(h_rocsolver_, &memory_size);
 #endif
     }
   }

Member Data Documentation

◆ h_rocsolver_

rocblas_handle h_rocsolver_

private

Definition at line 51 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), rocSolverInverter< T_FP >::resize(), rocSolverInverter< T_FP >::rocSolverInverter(), and rocSolverInverter< T_FP >::~rocSolverInverter().

◆ hstream_

hipStream_t hstream_

private

Definition at line 52 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), rocSolverInverter< T_FP >::rocSolverInverter(), and rocSolverInverter< T_FP >::~rocSolverInverter().

◆ ipiv

Vector<int, CUDAHostAllocator<int> > ipiv

private

pivot array + info

Definition at line 42 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ ipiv_gpu

Vector<int, CUDAAllocator<int> > ipiv_gpu

private

Definition at line 43 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ LU_diag

Vector<T_FP, CUDAHostAllocator<T_FP> > LU_diag

private

diagonal terms of LU matrix

Definition at line 45 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ LU_diag_gpu

Vector<T_FP, CUDAAllocator<T_FP> > LU_diag_gpu

private

Definition at line 46 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ Mat1_gpu

Matrix<T_FP, CUDAAllocator<T_FP> > Mat1_gpu

private

scratch memory for rocSOLVER

Definition at line 38 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ Mat2_gpu

Matrix<T_FP, CUDAAllocator<T_FP> > Mat2_gpu

private

scratch memory for rocSOLVER

Definition at line 40 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose().

◆ work_gpu

Vector<T_FP, CUDAAllocator<T_FP> > work_gpu

private

workspace

Definition at line 48 of file rocSolverInverter.hpp.

The documentation for this class was generated from the following file:

/home/pk7/projects/qmc/for_cron_doxygen/qmcpack/src/QMCWaveFunctions/Fermion/rocSolverInverter.hpp

Public Member Functions

Private Member Functions

Private Attributes

Detailed Description

template<typename T_FP> class qmcplusplus::rocSolverInverter< T_FP >

Constructor & Destructor Documentation

◆ rocSolverInverter()

◆ ~rocSolverInverter()

Member Function Documentation

◆ invert_transpose() [1/2]

◆ invert_transpose() [2/2]

◆ resize()

Member Data Documentation

◆ h_rocsolver_

◆ hstream_

◆ ipiv

◆ ipiv_gpu

◆ LU_diag

◆ LU_diag_gpu

◆ Mat1_gpu

◆ Mat2_gpu

◆ work_gpu

template<typename T_FP>
class qmcplusplus::rocSolverInverter< T_FP >