QMCPACK
cuSolverInverter.hpp
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2019 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_CUSOLVERINVERTOR_H
#define QMCPLUSPLUS_CUSOLVERINVERTOR_H

#include "OhmmsPETE/OhmmsVector.h"
#include "OhmmsPETE/OhmmsMatrix.h"
#include "CUDA/CUDAruntime.hpp"
#include "CUDA/CUDAallocator.hpp"
#include "CUDA/cusolver.hpp"
#include "CPU/math.hpp"

namespace qmcplusplus
{
/** implements matrix inversion via cuSolverDN
 * @tparam T_FP high precision for matrix inversion, T_FP >= T
 */
template<typename T_FP>
class cuSolverInverter
{
  /// scratch memory for cusolverDN
  Matrix<T_FP, CUDAAllocator<T_FP>> Mat1_gpu;
  /// scratch memory for cusolverDN
  Matrix<T_FP, CUDAAllocator<T_FP>> Mat2_gpu;
  /// pivot array + info; element 0 receives cuSolver's devInfo, elements 1..norb hold the LU pivots
  Vector<int, CUDAHostAllocator<int>> ipiv;
  Vector<int, CUDAAllocator<int>> ipiv_gpu;
  /// diagonal terms of LU matrix
  Vector<T_FP, CUDAHostAllocator<T_FP>> LU_diag;
  Vector<T_FP, CUDAAllocator<T_FP>> LU_diag_gpu;
  /// workspace
  Vector<T_FP, CUDAAllocator<T_FP>> work_gpu;

  // CUDA specific variables
  cusolverDnHandle_t h_cusolver_;
  cudaStream_t hstream_;

  /** resize the internal storage
   * @param norb number of electrons/orbitals
   */
  inline void resize(int norb)
  {
    if (Mat1_gpu.rows() != norb)
    {
      Mat1_gpu.resize(norb, norb);
      // prepare cusolver auxiliary arrays
      ipiv.resize(norb + 1);
      ipiv_gpu.resize(norb + 1);
      LU_diag.resize(norb);
      LU_diag_gpu.resize(norb);
      int lwork;
      cusolverErrorCheck(cusolver::getrf_bufferSize(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, &lwork),
                         "cusolver::getrf_bufferSize failed!");
      work_gpu.resize(lwork);
    }
  }

public:
  /// default constructor
  cuSolverInverter()
  {
    cudaErrorCheck(cudaStreamCreate(&hstream_), "cudaStreamCreate failed!");
    cusolverErrorCheck(cusolverDnCreate(&h_cusolver_), "cusolverCreate failed!");
    cusolverErrorCheck(cusolverDnSetStream(h_cusolver_, hstream_), "cusolverSetStream failed!");
  }

  ~cuSolverInverter()
  {
    cusolverErrorCheck(cusolverDnDestroy(h_cusolver_), "cusolverDestroy failed!");
    cudaErrorCheck(cudaStreamDestroy(hstream_), "cudaStreamDestroy failed!");
  }

  /** compute the inverse of the transpose of matrix A and its determinant value in log
   * when T_FP and TMAT are the same
   * @tparam TREAL real type
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                     Matrix<TMAT>& Ainv,
                                                                     Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                     std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    resize(norb);
    // upload the input matrix and LU-factorize it in place on the device
    cudaErrorCheck(cudaMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, hstream_),
                   "cudaMemcpyAsync failed!");
    cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                       ipiv_gpu.data() + 1, ipiv_gpu.data()),
                       "cusolver::getrf failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
    cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // check LU success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
    // solve A^T X = I with the LU factors; the result inv(A^T) overwrites the identity in Ainv_gpu
    make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                       Ainv_gpu.data(), norb, ipiv_gpu.data()),
                       "cusolver::getrs failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // log|det A| accumulated from the LU diagonal; the pivot parity fixes the sign
    computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
    cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    // check solve success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
  }

  /** compute the inverse of the transpose of matrix A and its determinant value in log
   * when T_FP and TMAT are not the same
   * @tparam TREAL real type
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                      Matrix<TMAT>& Ainv,
                                                                      Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                      std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    resize(norb);
    Mat2_gpu.resize(norb, norb);
    // upload the TMAT-precision input, then promote it to T_FP in Mat1_gpu before factorization
    cudaErrorCheck(cudaMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, hstream_),
                   "cudaMemcpyAsync failed!");
    copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                       ipiv_gpu.data() + 1, ipiv_gpu.data()),
                       "cusolver::getrf failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
    cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // check LU success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrf failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      throw std::runtime_error(err.str());
    }
    // solve in T_FP precision, then demote the result into Ainv_gpu (TMAT precision)
    make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
                                       Mat2_gpu.data(), norb, ipiv_gpu.data()),
                       "cusolver::getrs failed!");
    copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
    cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), cudaMemcpyDeviceToHost,
                                   hstream_),
                   "cudaMemcpyAsync failed!");
    // check solve success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize after getrs failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      throw std::runtime_error(err.str());
    }

    std::ostringstream nan_msg;
    for (int i = 0; i < norb; i++)
      if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
        nan_msg << "  Ainv[" << i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
    if (const std::string str = nan_msg.str(); !str.empty())
      throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
  }
};
} // namespace qmcplusplus

#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H
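
Note on log_value: invert_transpose obtains the log determinant from computeLogDet (declared in DiracMatrix.h, not in this header), which works from the diagonal of the LU factor and the pivot indices: |det A| is the product of the LU diagonal entries, and each row interchange recorded by a pivot flips the sign. The snippet below is only a minimal illustrative sketch of that accumulation under the same 1-based LAPACK pivot convention used above; the function name is hypothetical and not the QMCPACK implementation.

#include <complex>

// Sketch: accumulate log(det A) from the LU diagonal and the pivot array.
template<typename T, typename T_FP>
std::complex<T_FP> log_det_from_lu(const T* diag, int n, const int* pivot)
{
  std::complex<T_FP> logdet{};
  for (int i = 0; i < n; i++)
    // pivot[i] != i+1 records a row swap at step i, which negates the determinant
    logdet += std::log(std::complex<T_FP>((pivot[i] == i + 1) ? diag[i] : -diag[i]));
  return logdet;
}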
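For context, a minimal hypothetical call sequence is sketched below; the matrix names, the size, and the include path are assumptions for illustration, and only the same-precision overload is exercised. Note that the caller provides Ainv_gpu already sized to norb x norb, since invert_transpose writes the device-side identity into it before the getrs solve.

#include "QMCWaveFunctions/Fermion/cuSolverInverter.hpp" // path assumed
#include <complex>

using namespace qmcplusplus;

void example()
{
  constexpr int norb = 64;
  Matrix<double> A(norb, norb), Ainv(norb, norb);              // host matrices
  Matrix<double, CUDAAllocator<double>> Ainv_gpu(norb, norb);  // device-side result buffer
  // ... fill A with the matrix to invert ...
  std::complex<double> log_value;
  cuSolverInverter<double> inverter;
  inverter.invert_transpose(A, Ainv, Ainv_gpu, log_value);     // Ainv = inv(A^T), log_value = log det A
}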