#ifndef QMCPLUSPLUS_CUSOLVERINVERTOR_H
#define QMCPLUSPLUS_CUSOLVERINVERTOR_H

// ... includes and the opening of namespace qmcplusplus ...

/** implements matrix inversion via cuSolverDN
 *  @tparam T_FP the high precision used for the factorization itself
 */
template<typename T_FP>
class cuSolverInverter
{
  /// scratch memory for cusolverDN
  Matrix<T_FP, CUDAAllocator<T_FP>> Mat1_gpu;
  /// scratch memory for cusolverDN
  Matrix<T_FP, CUDAAllocator<T_FP>> Mat2_gpu;
  /// pivot array + info
  Vector<int, CUDAHostAllocator<int>> ipiv;
  Vector<int, CUDAAllocator<int>> ipiv_gpu;
  /// diagonal terms of LU matrix
  Vector<T_FP, CUDAHostAllocator<T_FP>> LU_diag;
  Vector<T_FP, CUDAAllocator<T_FP>> LU_diag_gpu;
  /// workspace
  Vector<T_FP, CUDAAllocator<T_FP>> work_gpu;

  /// cusolverDN handle and the CUDA stream it is bound to
  cusolverDnHandle_t h_cusolver_;
  cudaStream_t hstream_;

  /// resize the internal storage
  inline void resize(int norb)
  {
    if (Mat1_gpu.rows() != norb)
    {
      Mat1_gpu.resize(norb, norb);
      // prepare cusolver auxiliary arrays; slot 0 of ipiv doubles as the devInfo flag
      ipiv.resize(norb + 1);
      ipiv_gpu.resize(norb + 1);
      LU_diag.resize(norb);
      LU_diag_gpu.resize(norb);
      int lwork;
      cusolverErrorCheck(cusolver::getrf_bufferSize(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, &lwork),
                         "cusolver::getrf_bufferSize failed!");
      work_gpu.resize(lwork);
    }
  }

public:
  /// default constructor: creates the CUDA stream and the cusolverDN handle bound to it
  cuSolverInverter();

  /** compute the inverse of the transpose of matrix A and its determinant value in log
   *  when T_FP and TMAT are the same type
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT, Matrix<TMAT>& Ainv,
                                                                     Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                     std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    resize(norb);
    cudaErrorCheck(cudaMemcpyAsync(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, hstream_), "cudaMemcpyAsync failed!");
    cusolverErrorCheck(cusolver::getrf(h_cusolver_, norb, norb, Mat1_gpu.data(), norb, work_gpu.data(),
                                       ipiv_gpu.data() + 1, ipiv_gpu.data()), "cusolver::getrf failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), ipiv_gpu.size() * sizeof(int),
                                   cudaMemcpyDeviceToHost, hstream_), "cudaMemcpyAsync failed!");
    extract_matrix_diagonal_cuda(norb, Mat1_gpu.data(), norb, LU_diag_gpu.data(), hstream_);
    cudaErrorCheck(cudaMemcpyAsync(LU_diag.data(), LU_diag_gpu.data(), LU_diag.size() * sizeof(T_FP),
                                   cudaMemcpyDeviceToHost, hstream_), "cudaMemcpyAsync failed!");
    // check LU success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
    make_identity_matrix_cuda(norb, Ainv_gpu.data(), norb, hstream_);
    cusolverErrorCheck(cusolver::getrs(h_cusolver_, CUBLAS_OP_T, norb, norb, Mat1_gpu.data(), norb,
                                       ipiv_gpu.data() + 1, Ainv_gpu.data(), norb, ipiv_gpu.data()),
                       "cusolver::getrs failed!");
    computeLogDet(LU_diag.data(), norb, ipiv.data() + 1, log_value);
    cudaErrorCheck(cudaMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT),
                                   cudaMemcpyDeviceToHost, hstream_), "cudaMemcpyAsync failed!");
    cudaErrorCheck(cudaMemcpyAsync(ipiv.data(), ipiv_gpu.data(), sizeof(int), cudaMemcpyDeviceToHost, hstream_),
                   "cudaMemcpyAsync failed!");
    // check solve success
    cudaErrorCheck(cudaStreamSynchronize(hstream_), "cudaStreamSynchronize failed!");
    if (ipiv[0] != 0)
    {
      std::ostringstream err;
      err << "cusolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
  }

  /** compute the inverse of the transpose of matrix A and its determinant value in log
   *  when T_FP and TMAT are not the same type (mixed precision)
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT, Matrix<TMAT>& Ainv,
                                                                      Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                      std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    resize(norb);
    // ... same getrf/getrs sequence as above, except the TMAT input is widened into the T_FP scratch
    //     matrices via copy_matrix_cuda before factorization and narrowed back afterwards; getrf/getrs
    //     failures are reported with the same "calculation failed with devInfo = " << ipiv[0] messages ...

    // guard against the precision conversion producing garbage: inspect the diagonal of the inverse
    std::ostringstream nan_msg;
    for (int i = 0; i < norb; i++)
      if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
        nan_msg << "  Ainv[" << i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
    if (const std::string str = nan_msg.str(); !str.empty())
      throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
  }
};

// ... closing of namespace qmcplusplus ...
#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H
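For readers unfamiliar with the cuSOLVER dense LU workflow the class drives, the following standalone sketch shows the same factorize-then-solve-against-identity pattern using the raw cuSOLVER API (cusolverDnDgetrf_bufferSize / cusolverDnDgetrf / cusolverDnDgetrs) instead of QMCPACK's cusolver:: wrappers and Matrix/Vector containers. It is an illustration only: the 3x3 test matrix, the variable names, and the minimal error handling are assumptions, not QMCPACK code.

// Standalone cuSOLVER LU inversion sketch: factorize A, then solve A * X = I so X = A^{-1}.
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <cstdio>
#include <vector>

int main()
{
  const int n = 3;
  // column-major 3x3 test matrix (illustrative data)
  std::vector<double> A_h = {4, 3, 0, 3, 4, -1, 0, -1, 4};
  // right-hand side starts as the identity; getrs overwrites it with the inverse
  std::vector<double> I_h(n * n, 0.0);
  for (int i = 0; i < n; ++i)
    I_h[i * n + i] = 1.0;

  cusolverDnHandle_t handle;
  cusolverDnCreate(&handle);

  double *A_d, *B_d, *work_d;
  int *ipiv_d, *info_d;
  cudaMalloc(&A_d, sizeof(double) * n * n);
  cudaMalloc(&B_d, sizeof(double) * n * n);
  cudaMalloc(&ipiv_d, sizeof(int) * n);
  cudaMalloc(&info_d, sizeof(int));
  cudaMemcpy(A_d, A_h.data(), sizeof(double) * n * n, cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, I_h.data(), sizeof(double) * n * n, cudaMemcpyHostToDevice);

  // workspace query, mirroring cusolver::getrf_bufferSize in the class above
  int lwork = 0;
  cusolverDnDgetrf_bufferSize(handle, n, n, A_d, n, &lwork);
  cudaMalloc(&work_d, sizeof(double) * lwork);

  // in-place LU factorization, then solve against the identity
  cusolverDnDgetrf(handle, n, n, A_d, n, work_d, ipiv_d, info_d);
  cusolverDnDgetrs(handle, CUBLAS_OP_N, n, n, A_d, n, ipiv_d, B_d, n, info_d);

  // check the factorization/solve status, as cuSolverInverter does via ipiv[0]
  int info = 0;
  cudaMemcpy(&info, info_d, sizeof(int), cudaMemcpyDeviceToHost);
  if (info != 0)
    std::fprintf(stderr, "getrf/getrs reported devInfo = %d\n", info);

  cudaMemcpy(I_h.data(), B_d, sizeof(double) * n * n, cudaMemcpyDeviceToHost);
  for (int r = 0; r < n; ++r)
    std::printf("%8.4f %8.4f %8.4f\n", I_h[r], I_h[n + r], I_h[2 * n + r]);

  cudaFree(A_d); cudaFree(B_d); cudaFree(work_d); cudaFree(ipiv_d); cudaFree(info_d);
  cusolverDnDestroy(handle);
  return 0;
}

The class above differs mainly in that it keeps the pivots and the devInfo flag in one pinned ipiv vector, issues every transfer asynchronously on its own stream, and passes CUBLAS_OP_T to getrs so the solve matches the invert_transpose semantics of its row-major Matrix inputs.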
Member and helper reference for cuSolverInverter<T_FP> (implements matrix inversion via cuSolverDN):

cuSolverInverter()
default constructor

void resize(int norb)
resize the internal storage

std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT, Matrix<TMAT>& Ainv, Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu, std::complex<TREAL>& log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same type

std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT, Matrix<TMAT>& Ainv, Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu, std::complex<TREAL>& log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same type

Matrix<T_FP, CUDAAllocator<T_FP>> Mat1_gpu
scratch memory for cusolverDN

Matrix<T_FP, CUDAAllocator<T_FP>> Mat2_gpu
scratch memory for cusolverDN

Vector<int, CUDAHostAllocator<int>> ipiv
pivot array + info (host copy; ipiv[0] receives devInfo)

Vector<int, CUDAAllocator<int>> ipiv_gpu
pivot array + info (device copy)

Vector<T_FP, CUDAHostAllocator<T_FP>> LU_diag
diagonal terms of LU matrix (host copy)

Vector<T_FP, CUDAAllocator<T_FP>> LU_diag_gpu
diagonal terms of LU matrix (device copy)

Vector<T_FP, CUDAAllocator<T_FP>> work_gpu
workspace for the factorization

cusolverDnHandle_t h_cusolver_
cusolverDN handle used for all getrf/getrs calls

CUDAAllocator<T>
allocator for CUDA device memory

Referenced helpers:

cusolverStatus_t getrf_bufferSize(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, int *lwork)
query the workspace size required by the LU factorization

cusolverStatus_t getrf(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, double *work, int *ipiv, int *info)
in-place LU factorization with partial pivoting

cusolverStatus_t getrs(cusolverDnHandle_t &handle, const cublasOperation_t &transa, int m, int n, const double *A, int lda, int *ipiv, double *B, int ldb, int *info)
solve against the factorized matrix; used here with the identity as right-hand side to form the inverse

void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device

void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal

void copy_matrix_cuda(const int nrows, const int ncols, const double *mat_in, const int lda, float *mat_out, const int ldb, cudaStream_t hstream)
copy matrix with precision difference

void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex<T_FP> &logdet)
accumulate the log-determinant from the LU diagonal and the pivot array (a host-side sketch of this accumulation follows the list)

bool isnan(float a)
return true if the value is NaN

cudaErrorCheck(ans, cause) / cusolverErrorCheck(ans, cause)
error-checking macros wrapping CUDA runtime and cuSOLVER status codes
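The log_value returned by invert_transpose is assembled by computeLogDet from the LU diagonal extracted with extract_matrix_diagonal_cuda and the pivot indices produced by getrf. The sketch below spells out that accumulation; it assumes the LAPACK/cuSOLVER convention of 1-based pivot indices and is written as a free function (compute_log_det_sketch, a hypothetical name), not as the QMCPACK implementation.

#include <cmath>
#include <complex>

// For P*A = L*U with unit-diagonal L, det(A) = (sign of P) * prod_i U_ii. Each pivot[i] != i + 1
// records a row exchange and flips the sign. Taking std::log of a complex value keeps the result
// representable even when the determinant itself would overflow, and captures the sign/phase.
template<typename T, typename T_FP>
void compute_log_det_sketch(const T* diag, int n, const int* pivot, std::complex<T_FP>& logdet)
{
  logdet = std::complex<T_FP>();
  for (int i = 0; i < n; i++)
    logdet += std::log(std::complex<T_FP>((pivot[i] == i + 1) ? diag[i] : -diag[i]));
}

With the +1 offset used by the class (pivots live at ipiv.data() + 1 because slot 0 holds devInfo), this is the quantity accumulated into log_value.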