QMCPACK
rocSolverInverter< T_FP > Class Template Reference

implements matrix inversion via rocSolver More...

+ Collaboration diagram for rocSolverInverter< T_FP >:

Public Member Functions

 rocSolverInverter ()
 default constructor More...
 
 ~rocSolverInverter ()
 
template<typename TMAT , typename TREAL , typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
std::enable_if_t< std::is_same< TMAT, T_FP >::value > invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same More...
 
template<typename TMAT , typename TREAL , typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
std::enable_if_t<!std::is_same< TMAT, T_FP >::value > invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same More...
 

Private Member Functions

void resize (int norb)
 resize the internal storage More...
 

Private Attributes

Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
 scratch memory for cusolverDN More...
 
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
 scratch memory for cusolverDN More...
 
Vector< int, CUDAHostAllocator< int > > ipiv
 pivot array + info More...
 
Vector< int, CUDAAllocator< int > > ipiv_gpu
 
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
 diagonal terms of LU matrix More...
 
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
 
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
 workspace More...
 
rocblas_handle h_rocsolver_
 
hipStream_t hstream_
 

Detailed Description

template<typename T_FP>
class qmcplusplus::rocSolverInverter< T_FP >

implements matrix inversion via rocSolver

Template Parameters
T_FPhigh precision for matrix inversion, T_FP >= T

Definition at line 35 of file rocSolverInverter.hpp.

Constructor & Destructor Documentation

◆ rocSolverInverter()

rocSolverInverter ( )
inline

default constructor

Definition at line 84 of file rocSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, and rocsolverErrorCheck.

85  {
86  cudaErrorCheck(hipStreamCreate(&hstream_), "hipStreamCreate failed!");
87  rocsolverErrorCheck(rocblas_create_handle(&h_rocsolver_), "rocblas_create_handle failed!");
88  rocsolverErrorCheck(rocblas_set_stream(h_rocsolver_, hstream_), "rocblas_set_stream failed!");
89  }
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25

◆ ~rocSolverInverter()

~rocSolverInverter ( )
inline

Definition at line 91 of file rocSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, and rocsolverErrorCheck.

92  {
93  rocsolverErrorCheck(rocblas_destroy_handle(h_rocsolver_), "rocblas_destroy_handle failed!");
94  cudaErrorCheck(hipStreamDestroy(hstream_), "hipStreamDestroy failed!");
95  }
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25

Member Function Documentation

◆ invert_transpose() [1/2]

std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose ( const Matrix< TMAT > &  logdetT,
Matrix< TMAT > &  Ainv,
Matrix< TMAT, CUDAAllocator< TMAT >> &  Ainv_gpu,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same

Template Parameters
TREALreal type

Definition at line 102 of file rocSolverInverter.hpp.

References qmcplusplus::computeLogDet(), qmcplusplus::cudaErrorCheck(), Matrix< T, Alloc >::data(), extract_matrix_diagonal_cuda(), qmcplusplus::rocsolver::getrf(), qmcplusplus::rocsolver::getrs(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, rocSolverInverter< T_FP >::ipiv, rocSolverInverter< T_FP >::ipiv_gpu, rocSolverInverter< T_FP >::LU_diag, rocSolverInverter< T_FP >::LU_diag_gpu, make_identity_matrix_cuda(), rocSolverInverter< T_FP >::Mat1_gpu, rocSolverInverter< T_FP >::resize(), rocsolverErrorCheck, Matrix< T, Alloc >::rows(), and Matrix< T, Alloc >::size().

Referenced by DelayedUpdateCUDA< T, T_FP >::invert_transpose(), qmcplusplus::TEMPLATE_TEST_CASE(), and qmcplusplus::TEST_CASE().

106  {
107  const int norb = logdetT.rows();
108  resize(norb);
109  cudaErrorCheck(hipMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
110  hstream_),
111  "hipMemcpyAsync for logdetT to Mat1_gpu failed!");
112  rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
113  ipiv_gpu.data()),
114  "rocsolver::getrf failed!");
115  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
116  hstream_),
117  "hipMemcpyAsync for ipiv failed!");
118  extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
119  cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
120  hipMemcpyDeviceToHost, hstream_),
121  "hipMemcpyAsync for LU_diag failed!");
122  // check LU success
123  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
124  if (ipiv[0] != 0)
125  {
126  std::ostringstream err;
127  err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
128  std::cerr << err.str();
129  throw std::runtime_error(err.str());
130  }
131  make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
132  rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
133  ipiv_gpu.data() + 1, Ainv_gpu.data(), norb),
134  "rocsolver::getrs failed!");
135  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
136  "hipMemcpyAsync for ipiv failed!");
137  computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
138  cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
139  hstream_),
140  "hipMemcpyAsync for Ainv failed!");
141  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
142  if (ipiv[0] != 0)
143  {
144  std::ostringstream err;
145  err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
146  std::cerr << err.str();
147  throw std::runtime_error(err.str());
148  }
149  }
rocblas_status getrs(rocblas_handle &handle, const rocblas_operation &transa, int m, int n, double *A, int lda, int *ipiv, double *B, int ldb)
Definition: rocsolver.hpp:117
rocblas_status getrf(rocblas_handle &handle, int m, int n, double *A, int lda, int *ipiv, int *info)
Definition: rocsolver.hpp:101
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal
Vector< int, CUDAAllocator< int > > ipiv_gpu
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
void resize(int norb)
resize the internal storage

◆ invert_transpose() [2/2]

std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose ( const Matrix< TMAT > &  logdetT,
Matrix< TMAT > &  Ainv,
Matrix< TMAT, CUDAAllocator< TMAT >> &  Ainv_gpu,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same

Template Parameters
TREALreal type

Definition at line 156 of file rocSolverInverter.hpp.

References qmcplusplus::computeLogDet(), copy_matrix_cuda(), qmcplusplus::cudaErrorCheck(), Matrix< T, Alloc >::data(), extract_matrix_diagonal_cuda(), qmcplusplus::rocsolver::getrf(), qmcplusplus::rocsolver::getrs(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::hstream_, rocSolverInverter< T_FP >::ipiv, rocSolverInverter< T_FP >::ipiv_gpu, qmcplusplus::isnan(), rocSolverInverter< T_FP >::LU_diag, rocSolverInverter< T_FP >::LU_diag_gpu, make_identity_matrix_cuda(), rocSolverInverter< T_FP >::Mat1_gpu, rocSolverInverter< T_FP >::Mat2_gpu, norm(), rocSolverInverter< T_FP >::resize(), rocsolverErrorCheck, Matrix< T, Alloc >::rows(), and Matrix< T, Alloc >::size().

160  {
161  const int norb = logdetT.rows();
162  resize(norb);
163  Mat2_gpu.resize(norb, norb);
164  cudaErrorCheck(hipMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
165  hstream_),
166  "hipMemcpyAsync failed!");
167  copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
168  rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
169  ipiv_gpu.data()),
170  "rocsolver::getrf failed!");
171  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
172  hstream_),
173  "hipMemcpyAsync failed!");
174  extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
175  cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
176  hipMemcpyDeviceToHost, hstream_),
177  "hipMemcpyAsync failed!");
178  // check LU success
179  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
180  if (ipiv[0] != 0)
181  {
182  std::ostringstream err;
183  err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
184  std::cerr << err.str();
185  throw std::runtime_error(err.str());
186  }
187  make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
188  rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
189  ipiv_gpu.data() + 1, Mat2_gpu.data(), norb),
190  "rocsolver::getrs failed!");
191  copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
192  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
193  "hipMemcpyAsync failed!");
194  computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
195  cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
196  hstream_),
197  "hipMemcpyAsync failed!");
198  // check solve success
199  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
200  if (ipiv[0] != 0)
201  {
202  std::ostringstream err;
203  err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
204  std::cerr << err.str();
205  throw std::runtime_error(err.str());
206  }
207 
208  std::ostringstream nan_msg;
209  for(int i = 0; i < norb; i++)
210  if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
211  nan_msg << " Ainv["<< i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
212  if (const std::string str = nan_msg.str(); !str.empty())
213  throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
214  }
rocblas_status getrs(rocblas_handle &handle, const rocblas_operation &transa, int m, int n, double *A, int lda, int *ipiv, double *B, int ldb)
Definition: rocsolver.hpp:117
rocblas_status getrf(rocblas_handle &handle, int m, int n, double *A, int lda, int *ipiv, int *info)
Definition: rocsolver.hpp:101
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal
void copy_matrix_cuda(const int nrows, const int ncols, const double *mat_in, const int lda, float *mat_out, const int ldb, cudaStream_t hstream)
copy matrix with precision difference
double norm(const zVec &c)
Definition: VectorOps.h:118
Vector< int, CUDAAllocator< int > > ipiv_gpu
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
scratch memory for cusolverDN
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
bool isnan(float a)
return true if the value is NaN.
Definition: math.cpp:18
void resize(int norb)
resize the internal storage

◆ resize()

void resize ( int  norb)
inlineprivate

resize the internal storage

Parameters
norbnumber of electrons/orbitals
delay,maximumdelay 0<delay<=norb

Definition at line 58 of file rocSolverInverter.hpp.

References dgetrf(), dgetri(), rocSolverInverter< T_FP >::h_rocsolver_, rocSolverInverter< T_FP >::ipiv, rocSolverInverter< T_FP >::ipiv_gpu, rocSolverInverter< T_FP >::LU_diag, rocSolverInverter< T_FP >::LU_diag_gpu, rocSolverInverter< T_FP >::Mat1_gpu, and rocsolverErrorCheck.

Referenced by rocSolverInverter< T_FP >::invert_transpose().

59  {
60  if (Mat1_gpu.rows() != norb)
61  {
62  Mat1_gpu.resize(norb, norb);
63  // prepare cusolver auxiliary arrays
64  ipiv.resize(norb + 1);
65  ipiv_gpu.resize(norb + 1);
66  LU_diag.resize(norb);
67  LU_diag_gpu.resize(norb);
68 
69  // Memory for temporary storage for solver calls.
70  // The rocSOLVER library handles this memory itself.
71  // If we need more control, there are API's to get the size and set the buffer memory
72 #if 0
73  size_t memory_size;
74  rocblas_start_device_memory_size_query(h_rocsolver_);
75  rocsolverErrorCheck(rocsolver::dgetrf(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr);
76  rocsolverErrorCheck(rocsolver::dgetri(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr);
77  rocblas_stop_device_memory_size_query(h_rocsolver_, &memory_size);
78 #endif
79  }
80  }
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
Vector< int, CUDAAllocator< int > > ipiv_gpu
void dgetrf(const int &n, const int &m, double *a, const int &n0, int *piv, int &st)
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
void dgetri(const int &n, double *a, const int &n0, int const *piv, double *work, const int &, int &st)
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN

Member Data Documentation

◆ h_rocsolver_

◆ hstream_

◆ ipiv

Vector<int, CUDAHostAllocator<int> > ipiv
private

pivot array + info

Definition at line 42 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ ipiv_gpu

Vector<int, CUDAAllocator<int> > ipiv_gpu
private

◆ LU_diag

Vector<T_FP, CUDAHostAllocator<T_FP> > LU_diag
private

diagonal terms of LU matrix

Definition at line 45 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ LU_diag_gpu

Vector<T_FP, CUDAAllocator<T_FP> > LU_diag_gpu
private

◆ Mat1_gpu

Matrix<T_FP, CUDAAllocator<T_FP> > Mat1_gpu
private

scratch memory for cusolverDN

Definition at line 38 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose(), and rocSolverInverter< T_FP >::resize().

◆ Mat2_gpu

Matrix<T_FP, CUDAAllocator<T_FP> > Mat2_gpu
private

scratch memory for cusolverDN

Definition at line 40 of file rocSolverInverter.hpp.

Referenced by rocSolverInverter< T_FP >::invert_transpose().

◆ work_gpu

Vector<T_FP, CUDAAllocator<T_FP> > work_gpu
private

workspace

Definition at line 48 of file rocSolverInverter.hpp.


The documentation for this class was generated from the following file: