QMCPACK
rocSolverInverter.hpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2022 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 // Mark Dewing, mdewing@anl.gov, Argonne National Laboratory
9 //
10 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
11 //////////////////////////////////////////////////////////////////////////////////////
12 
13 #ifndef QMCPLUSPLUS_ROCSOLVERINVERTER_H
14 #define QMCPLUSPLUS_ROCSOLVERINVERTER_H
15 
16 #include "OhmmsPETE/OhmmsVector.h"
17 #include "OhmmsPETE/OhmmsMatrix.h"
18 
19 #if !defined(QMC_CUDA2HIP)
20 #error rocSolverInverter.hpp expects QMC_CUDA2HIP to be defined
21 #endif
22 // This file assumes that QMC_CUDA2HIP is defined and that creates HIP versions of these functions (despite being labeled with "CUDA")
23 #include "CUDA/CUDAruntime.hpp"
24 #include "CUDA/CUDAallocator.hpp"
25 #include "ROCm/rocsolver.hpp"
27 #include "CPU/math.hpp"
28 
29 namespace qmcplusplus
30 {
31 /** implements matrix inversion via rocSolver
32  * @tparam T_FP high precision for matrix inversion, T_FP >= T
33  */
34 template<typename T_FP>
36 {
37  /// scratch memory for cusolverDN
39  /// scratch memory for cusolverDN
41  /// pivot array + info
44  /// diagonal terms of LU matrix
47  /// workspace
49 
50  // CUDA specific variables
51  rocblas_handle h_rocsolver_;
52  hipStream_t hstream_;
53 
54  /** resize the internal storage
55  * @param norb number of electrons/orbitals
56  * @param delay, maximum delay 0<delay<=norb
57  */
58  inline void resize(int norb)
59  {
60  if (Mat1_gpu.rows() != norb)
61  {
62  Mat1_gpu.resize(norb, norb);
63  // prepare cusolver auxiliary arrays
64  ipiv.resize(norb + 1);
65  ipiv_gpu.resize(norb + 1);
66  LU_diag.resize(norb);
67  LU_diag_gpu.resize(norb);
68 
69  // Memory for temporary storage for solver calls.
70  // The rocSOLVER library handles this memory itself.
71  // If we need more control, there are API's to get the size and set the buffer memory
72 #if 0
73  size_t memory_size;
74  rocblas_start_device_memory_size_query(h_rocsolver_);
75  rocsolverErrorCheck(rocsolver::dgetrf(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr);
76  rocsolverErrorCheck(rocsolver::dgetri(h_rocsolver_, norb, norb, nullptr, norb, nullptr, nullptr);
77  rocblas_stop_device_memory_size_query(h_rocsolver_, &memory_size);
78 #endif
79  }
80  }
81 
82 public:
83  /// default constructor
85  {
86  cudaErrorCheck(hipStreamCreate(&hstream_), "hipStreamCreate failed!");
87  rocsolverErrorCheck(rocblas_create_handle(&h_rocsolver_), "rocblas_create_handle failed!");
88  rocsolverErrorCheck(rocblas_set_stream(h_rocsolver_, hstream_), "rocblas_set_stream failed!");
89  }
90 
92  {
93  rocsolverErrorCheck(rocblas_destroy_handle(h_rocsolver_), "rocblas_destroy_handle failed!");
94  cudaErrorCheck(hipStreamDestroy(hstream_), "hipStreamDestroy failed!");
95  }
96 
97  /** compute the inverse of the transpose of matrix A and its determinant value in log
98  * when T_FP and TMAT are the same
99  * @tparam TREAL real type
100  */
101  template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
102  std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
103  Matrix<TMAT>& Ainv,
104  Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
105  std::complex<TREAL>& log_value)
106  {
107  const int norb = logdetT.rows();
108  resize(norb);
109  cudaErrorCheck(hipMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
110  hstream_),
111  "hipMemcpyAsync for logdetT to Mat1_gpu failed!");
112  rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
113  ipiv_gpu.data()),
114  "rocsolver::getrf failed!");
115  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
116  hstream_),
117  "hipMemcpyAsync for ipiv failed!");
118  extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
119  cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
120  hipMemcpyDeviceToHost, hstream_),
121  "hipMemcpyAsync for LU_diag failed!");
122  // check LU success
123  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
124  if (ipiv[0] != 0)
125  {
126  std::ostringstream err;
127  err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
128  std::cerr << err.str();
129  throw std::runtime_error(err.str());
130  }
131  make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
132  rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
133  ipiv_gpu.data() + 1, Ainv_gpu.data(), norb),
134  "rocsolver::getrs failed!");
135  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
136  "hipMemcpyAsync for ipiv failed!");
137  computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
138  cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
139  hstream_),
140  "hipMemcpyAsync for Ainv failed!");
141  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
142  if (ipiv[0] != 0)
143  {
144  std::ostringstream err;
145  err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
146  std::cerr << err.str();
147  throw std::runtime_error(err.str());
148  }
149  }
150 
151  /** compute the inverse of the transpose of matrix A and its determinant value in log
152  * when T_FP and TMAT are not the same
153  * @tparam TREAL real type
154  */
155  template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
156  std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
157  Matrix<TMAT>& Ainv,
158  Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
159  std::complex<TREAL>& log_value)
160  {
161  const int norb = logdetT.rows();
162  resize(norb);
163  Mat2_gpu.resize(norb, norb);
164  cudaErrorCheck(hipMemcpyAsync(Mat2_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT), hipMemcpyHostToDevice,
165  hstream_),
166  "hipMemcpyAsync failed!");
167  copy_matrix_cuda(norb, norb, (TMAT*)Mat2_gpu.data(), norb, Mat1_gpu.data(), norb, hstream_);
168  rocsolverErrorCheck(rocsolver::getrf(h_rocsolver_, norb, norb, Mat1_gpu.data(), norb, ipiv_gpu.data() + 1,
169  ipiv_gpu.data()),
170  "rocsolver::getrf failed!");
171  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int), hipMemcpyDeviceToHost,
172  hstream_),
173  "hipMemcpyAsync failed!");
174  extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
175  cudaErrorCheck(hipMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
176  hipMemcpyDeviceToHost, hstream_),
177  "hipMemcpyAsync failed!");
178  // check LU success
179  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrf failed!");
180  if (ipiv[0] != 0)
181  {
182  std::ostringstream err;
183  err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
184  std::cerr << err.str();
185  throw std::runtime_error(err.str());
186  }
187  make_identity_matrix_cuda(norb, Mat2_gpu.data(), norb, hstream_);
188  rocsolverErrorCheck(rocsolver::getrs(h_rocsolver_, rocblas_operation_transpose, norb, norb, Mat1_gpu.data(), norb,
189  ipiv_gpu.data() + 1, Mat2_gpu.data(), norb),
190  "rocsolver::getrs failed!");
191  copy_matrix_cuda(norb, norb, Mat2_gpu.data(), norb, Ainv_gpu.data(), norb, hstream_);
192  cudaErrorCheck(hipMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), hipMemcpyDeviceToHost, hstream_),
193  "hipMemcpyAsync failed!");
194  computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
195  cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), hipMemcpyDeviceToHost,
196  hstream_),
197  "hipMemcpyAsync failed!");
198  // check solve success
199  cudaErrorCheck(hipStreamSynchronize(hstream_), "hipStreamSynchronize after getrs failed!");
200  if (ipiv[0] != 0)
201  {
202  std::ostringstream err;
203  err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
204  std::cerr << err.str();
205  throw std::runtime_error(err.str());
206  }
207 
208  std::ostringstream nan_msg;
209  for(int i = 0; i < norb; i++)
210  if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
211  nan_msg << " Ainv["<< i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
212  if (const std::string str = nan_msg.str(); !str.empty())
213  throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
214  }
215 };
216 } // namespace qmcplusplus
217 
218 #endif // QMCPLUSPLUS_ROCSOLVERINVERTER_H
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
workspace
rocSolverInverter()
default constructor
rocblas_status getrs(rocblas_handle &handle, const rocblas_operation &transa, int m, int n, double *A, int lda, int *ipiv, double *B, int ldb)
Definition: rocsolver.hpp:117
rocblas_status getrf(rocblas_handle &handle, int m, int n, double *A, int lda, int *ipiv, int *info)
Definition: rocsolver.hpp:101
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device
handle CUDA/HIP runtime selection.
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal
std::enable_if_t< std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
void copy_matrix_cuda(const int nrows, const int ncols, const double *mat_in, const int lda, float *mat_out, const int ldb, cudaStream_t hstream)
copy matrix with precision difference
this file provides three C++ memory allocators using CUDA specific memory allocation functions...
implements matrix inversion via rocSolver
size_type size() const
Definition: OhmmsMatrix.h:76
double norm(const zVec &c)
Definition: VectorOps.h:118
Vector< int, CUDAAllocator< int > > ipiv_gpu
void dgetrf(const int &n, const int &m, double *a, const int &n0, int *piv, int &st)
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100
std::enable_if_t<!std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
size_type rows() const
Definition: OhmmsMatrix.h:77
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
scratch memory for cusolverDN
#define rocsolverErrorCheck(ans, cause)
Definition: rocsolver.hpp:25
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix
allocator for CUDA device memory
void dgetri(const int &n, double *a, const int &n0, int const *piv, double *work, const int &, int &st)
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN
bool isnan(float a)
return true if the value is NaN.
Definition: math.cpp:18
void resize(int norb)
resize the internal storage