QMCPACK
cuSolverInverter< T_FP > Class Template Reference

implements matrix inversion via cuSolverDN More...

+ Collaboration diagram for cuSolverInverter< T_FP >:

Public Member Functions

 cuSolverInverter ()
 default constructor More...
 
 ~cuSolverInverter ()
 
template<typename TMAT , typename TREAL , typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
std::enable_if_t< std::is_same< TMAT, T_FP >::value > invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same More...
 
template<typename TMAT , typename TREAL , typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
std::enable_if_t<!std::is_same< TMAT, T_FP >::value > invert_transpose (const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
 compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same More...
 

Private Member Functions

void resize (int norb)
 resize the internal storage More...
 

Private Attributes

Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
 scratch memory for cusolverDN More...
 
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
 scratch memory for cusolverDN More...
 
Vector< int, CUDAHostAllocator< int > > ipiv
 pivot array + info More...
 
Vector< int, CUDAAllocator< int > > ipiv_gpu
 
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
 diagonal terms of LU matrix More...
 
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
 
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
 workspace More...
 
cusolverDnHandle_t h_cusolver_
 
cudaStream_t hstream_
 

Detailed Description

template<typename T_FP>
class qmcplusplus::cuSolverInverter< T_FP >

implements matrix inversion via cuSolverDN

Template Parameters
T_FPhigh precision for matrix inversion, T_FP >= T

Definition at line 29 of file cuSolverInverter.hpp.

Constructor & Destructor Documentation

◆ cuSolverInverter()

cuSolverInverter ( )
inline

default constructor

Definition at line 71 of file cuSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), cudaStreamCreate, cusolverErrorCheck, cuSolverInverter< T_FP >::h_cusolver_, and cuSolverInverter< T_FP >::hstream_.

72  {
73  cudaErrorCheck(cudaStreamCreate(&hstream_), "cudaStreamCreate failed!");
74  cusolverErrorCheck(cusolverDnCreate(&h_cusolver_), "cusolverCreate failed!");
75  cusolverErrorCheck(cusolverDnSetStream(h_cusolver_, hstream_), "cusolverSetStream failed!");
76  }
#define cudaStreamCreate
Definition: cuda2hip.h:150
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cusolverErrorCheck(ans, cause)
Definition: cusolver.hpp:21

◆ ~cuSolverInverter()

~cuSolverInverter ( )
inline

Definition at line 78 of file cuSolverInverter.hpp.

References qmcplusplus::cudaErrorCheck(), cudaStreamDestroy, cusolverErrorCheck, cuSolverInverter< T_FP >::h_cusolver_, and cuSolverInverter< T_FP >::hstream_.

79  {
80  cusolverErrorCheck(cusolverDnDestroy(h_cusolver_), "cusolverDestroy failed!");
81  cudaErrorCheck(cudaStreamDestroy(hstream_), "cudaStreamDestroy failed!");
82  }
#define cudaStreamDestroy
Definition: cuda2hip.h:151
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cusolverErrorCheck(ans, cause)
Definition: cusolver.hpp:21

Member Function Documentation

◆ invert_transpose() [1/2]

std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose ( const Matrix< TMAT > &  logdetT,
Matrix< TMAT > &  Ainv,
Matrix< TMAT, CUDAAllocator< TMAT >> &  Ainv_gpu,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same

Template Parameters
TREALreal type

Definition at line 89 of file cuSolverInverter.hpp.

References qmcplusplus::computeLogDet(), CUBLAS_OP_T, qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice, cudaStreamSynchronize, cusolverErrorCheck, Matrix< T, Alloc >::data(), extract_matrix_diagonal_cuda(), qmcplusplus::cusolver::getrf(), qmcplusplus::cusolver::getrs(), cuSolverInverter< T_FP >::h_cusolver_, cuSolverInverter< T_FP >::hstream_, cuSolverInverter< T_FP >::ipiv, cuSolverInverter< T_FP >::ipiv_gpu, cuSolverInverter< T_FP >::LU_diag, cuSolverInverter< T_FP >::LU_diag_gpu, make_identity_matrix_cuda(), cuSolverInverter< T_FP >::Mat1_gpu, cuSolverInverter< T_FP >::resize(), Matrix< T, Alloc >::rows(), Matrix< T, Alloc >::size(), and cuSolverInverter< T_FP >::work_gpu.

Referenced by qmcplusplus::TEMPLATE_TEST_CASE(), and qmcplusplus::TEST_CASE().

93  {
94  const int norb = logdetT.rows();
95  resize(norb);
96  cudaErrorCheck(cudaMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
98  "cudaMemcpyAsync failed!");
99  cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
100  ipiv_gpu.data() + 1, ipiv_gpu.data()),
101  "cusolver::getrf failed!");
102  cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
103  hstream_),
104  "cudaMemcpyAsync failed!");
105  extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
106  cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
108  "cudaMemcpyAsync failed!");
109  // check LU success
110  cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
111  if (ipiv[0] != 0)
112  {
113  std::ostringstream err;
114  err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
115  std::cerr << err.str();
116  throw std::runtime_error(err.str());
117  }
118  make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
119  cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
120  Ainv_gpu.data(), norb, ipiv_gpu.data()),
121  "cusolver::getrs failed!");
123  "cudaMemcpyAsync failed!");
124  computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
125  cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
126  hstream_),
127  "cudaMemcpyAsync failed!");
128  // check solve success
129  cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
130  if (ipiv[0] != 0)
131  {
132  std::ostringstream err;
133  err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
134  std::cerr << err.str();
135  throw std::runtime_error(err.str());
136  }
137  }
cusolverStatus_t getrs(cusolverDnHandle_t &handle, const cublasOperation_t &transa, int m, int n, const double *A, int lda, int *ipiv, double *B, int ldb, int *info)
Definition: cusolver.hpp:116
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
void resize(int norb)
resize the internal storage
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
workspace
Vector< int, CUDAAllocator< int > > ipiv_gpu
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cudaMemcpyDeviceToHost
Definition: cuda2hip.h:138
#define CUBLAS_OP_T
Definition: cuda2hip.h:20
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
#define cudaStreamSynchronize
Definition: cuda2hip.h:152
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100
#define cudaMemcpyHostToDevice
Definition: cuda2hip.h:139
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
#define cusolverErrorCheck(ans, cause)
Definition: cusolver.hpp:21
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
cusolverStatus_t getrf(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, double *work, int *ipiv, int *info)
Definition: cusolver.hpp:92
#define cudaMemcpyAsync
Definition: cuda2hip.h:136

◆ invert_transpose() [2/2]

std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose ( const Matrix< TMAT > &  logdetT,
Matrix< TMAT > &  Ainv,
Matrix< TMAT, CUDAAllocator< TMAT >> &  Ainv_gpu,
std::complex< TREAL > &  log_value 
)
inline

compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same

Template Parameters
TREALreal type

Definition at line 144 of file cuSolverInverter.hpp.

References qmcplusplus::computeLogDet(), copy_matrix_cuda(), CUBLAS_OP_T, qmcplusplus::cudaErrorCheck(), cudaMemcpyAsync, cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice, cudaStreamSynchronize, cusolverErrorCheck, Matrix< T, Alloc >::data(), extract_matrix_diagonal_cuda(), qmcplusplus::cusolver::getrf(), qmcplusplus::cusolver::getrs(), cuSolverInverter< T_FP >::h_cusolver_, cuSolverInverter< T_FP >::hstream_, cuSolverInverter< T_FP >::ipiv, cuSolverInverter< T_FP >::ipiv_gpu, qmcplusplus::isnan(), cuSolverInverter< T_FP >::LU_diag, cuSolverInverter< T_FP >::LU_diag_gpu, make_identity_matrix_cuda(), cuSolverInverter< T_FP >::Mat1_gpu, cuSolverInverter< T_FP >::Mat2_gpu, norm(), cuSolverInverter< T_FP >::resize(), Matrix< T, Alloc >::rows(), Matrix< T, Alloc >::size(), and cuSolverInverter< T_FP >::work_gpu.

148  {
149  const int norb = logdetT.rows();
150  resize(norb);
151  Mat2_gpu.resize(norb, norb);
152  cudaErrorCheck(cudaMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
154  "cudaMemcpyAsync failed!");
155  copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
156  cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
157  ipiv_gpu.data() + 1, ipiv_gpu.data()),
158  "cusolver::getrf failed!");
159  cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
160  hstream_),
161  "cudaMemcpyAsync failed!");
162  extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
163  cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
165  "cudaMemcpyAsync failed!");
166  // check LU success
167  cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
168  if (ipiv[0] != 0)
169  {
170  std::ostringstream err;
171  err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
172  throw std::runtime_error(err.str());
173  }
174  make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
175  cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
176  Mat2_gpu.data(), norb, ipiv_gpu.data()),
177  "cusolver::getrs failed!");
178  copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
180  "cudaMemcpyAsync failed!");
181  computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
182  cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
183  hstream_),
184  "cudaMemcpyAsync failed!");
185  // check solve success
186  cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
187  if (ipiv[0] != 0)
188  {
189  std::ostringstream err;
190  err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
191  throw std::runtime_error(err.str());
192  }
193 
194  std::ostringstream nan_msg;
195  for(int i = 0; i < norb; i++)
196  if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
197  nan_msg << " Ainv["<< i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
198  if (const std::string str = nan_msg.str(); !str.empty())
199  throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
200  }
cusolverStatus_t getrs(cusolverDnHandle_t &handle, const cublasOperation_t &transa, int m, int n, const double *A, int lda, int *ipiv, double *B, int ldb, int *info)
Definition: cusolver.hpp:116
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal
void copy_matrix_cuda(const int nrows, const int ncols, const double *mat_in, const int lda, float *mat_out, const int ldb, cudaStream_t hstream)
copy matrix with precision difference
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
scratch memory for cusolverDN
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
void resize(int norb)
resize the internal storage
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
workspace
double norm(const zVec &c)
Definition: VectorOps.h:118
Vector< int, CUDAAllocator< int > > ipiv_gpu
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cudaMemcpyDeviceToHost
Definition: cuda2hip.h:138
#define CUBLAS_OP_T
Definition: cuda2hip.h:20
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
#define cudaStreamSynchronize
Definition: cuda2hip.h:152
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100
#define cudaMemcpyHostToDevice
Definition: cuda2hip.h:139
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
#define cusolverErrorCheck(ans, cause)
Definition: cusolver.hpp:21
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
cusolverStatus_t getrf(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, double *work, int *ipiv, int *info)
Definition: cusolver.hpp:92
bool isnan(float a)
return true if the value is NaN.
Definition: math.cpp:18
#define cudaMemcpyAsync
Definition: cuda2hip.h:136

◆ resize()

void resize ( int  norb)
inlineprivate

resize the internal storage

Parameters
norbnumber of electrons/orbitals
delay,maximumdelay 0<delay<=norb

Definition at line 52 of file cuSolverInverter.hpp.

References cusolverErrorCheck, qmcplusplus::cusolver::getrf_bufferSize(), cuSolverInverter< T_FP >::h_cusolver_, cuSolverInverter< T_FP >::ipiv, cuSolverInverter< T_FP >::ipiv_gpu, cuSolverInverter< T_FP >::LU_diag, cuSolverInverter< T_FP >::LU_diag_gpu, cuSolverInverter< T_FP >::Mat1_gpu, and cuSolverInverter< T_FP >::work_gpu.

Referenced by cuSolverInverter< T_FP >::invert_transpose().

53  {
54  if (Mat1_gpu.rows() != norb)
55  {
56  Mat1_gpu.resize(norb, norb);
57  // prepare cusolver auxiliary arrays
58  ipiv.resize(norb + 1);
59  ipiv_gpu.resize(norb + 1);
60  LU_diag.resize(norb);
61  LU_diag_gpu.resize(norb);
62  int lwork;
63  cusolverErrorCheck(cusolver::getrf_bufferSize(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, &lwork),
64  "cusolver::getrf_bufferSize failed!");
65  work_gpu.resize(lwork);
66  }
67  }
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
workspace
Vector< int, CUDAAllocator< int > > ipiv_gpu
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
cusolverStatus_t getrf_bufferSize(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, int *lwork)
Definition: cusolver.hpp:77
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
#define cusolverErrorCheck(ans, cause)
Definition: cusolver.hpp:21
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix

Member Data Documentation

◆ h_cusolver_

◆ hstream_

◆ ipiv

Vector<int, CUDAHostAllocator<int> > ipiv
private

pivot array + info

Definition at line 36 of file cuSolverInverter.hpp.

Referenced by cuSolverInverter< T_FP >::invert_transpose(), and cuSolverInverter< T_FP >::resize().

◆ ipiv_gpu

Vector<int, CUDAAllocator<int> > ipiv_gpu
private

◆ LU_diag

Vector<T_FP, CUDAHostAllocator<T_FP> > LU_diag
private

diagonal terms of LU matrix

Definition at line 39 of file cuSolverInverter.hpp.

Referenced by cuSolverInverter< T_FP >::invert_transpose(), and cuSolverInverter< T_FP >::resize().

◆ LU_diag_gpu

Vector<T_FP, CUDAAllocator<T_FP> > LU_diag_gpu
private

◆ Mat1_gpu

Matrix<T_FP, CUDAAllocator<T_FP> > Mat1_gpu
private

scratch memory for cusolverDN

Definition at line 32 of file cuSolverInverter.hpp.

Referenced by cuSolverInverter< T_FP >::invert_transpose(), and cuSolverInverter< T_FP >::resize().

◆ Mat2_gpu

Matrix<T_FP, CUDAAllocator<T_FP> > Mat2_gpu
private

scratch memory for cusolverDN

Definition at line 34 of file cuSolverInverter.hpp.

Referenced by cuSolverInverter< T_FP >::invert_transpose().

◆ work_gpu

Vector<T_FP, CUDAAllocator<T_FP> > work_gpu
private

The documentation for this class was generated from the following file: