#ifndef QMCPLUSPLUS_ROCSOLVERINVERTER_H
#define QMCPLUSPLUS_ROCSOLVERINVERTER_H
// ...
#if !defined(QMC_CUDA2HIP)
#error rocSolverInverter.hpp expects QMC_CUDA2HIP to be defined
#endif
// ...
namespace qmcplusplus
{
/// implements matrix inversion via rocSolver
template<typename T_FP>
class rocSolverInverter
{
  // ... members elided; see the member reference after the listing ...

  /// resize the internal storage
  void resize(int norb)
  {
    // ...
    ipiv.resize(norb + 1);
    // ... query the device workspace rocSOLVER needs for this problem size
    //     (a standalone sketch of the size-query idiom follows the listing) ...
    rocblas_stop_device_memory_size_query(h_rocsolver_, &memory_size);
    // ...
  }
  /** compute the inverse of the transpose of matrix A and its determinant value in log
   *  when T_FP and TMAT are the same type
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                     Matrix<TMAT>& Ainv,
                                                                     Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                     std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    // ...
    cudaErrorCheck(hipMemcpyAsync(/* logdetT (host) -> Mat1_gpu (device) ... */),
                   "hipMemcpyAsync for logdetT to Mat1_gpu failed!");
    rocsolverErrorCheck(rocsolver::getrf(/* ... LU-factorize Mat1_gpu in place ... */),
                        "rocsolver::getrf failed!");
    cudaErrorCheck(hipMemcpyAsync(/* ipiv_gpu -> ipiv ... */),
                   "hipMemcpyAsync for ipiv failed!");
    // ... extract the LU diagonal on the device ...
    cudaErrorCheck(hipMemcpyAsync(/* LU_diag_gpu -> LU_diag ... */),
                   "hipMemcpyAsync for LU_diag failed!");
    // ...
    if (/* elided: devInfo reported in ipiv[0] is nonzero */)
    {
      std::ostringstream err;
      err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
    // ... write an identity into Ainv_gpu and solve for the inverse
    //     (see the standalone getrf/getrs sketch after the listing) ...
    rocsolverErrorCheck(rocsolver::getrs(/* ... */ ipiv_gpu.data() + 1, Ainv_gpu.data(), norb),
                        "rocsolver::getrs failed!");
    cudaErrorCheck(hipMemcpyAsync(/* ipiv_gpu -> ipiv ... */),
                   "hipMemcpyAsync for ipiv failed!");
    // ...
    cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT),
                                  hipMemcpyDeviceToHost, /* hip stream */),
                   "hipMemcpyAsync for Ainv failed!");
    // ...
    if (/* elided: devInfo reported in ipiv[0] is nonzero */)
    {
      std::ostringstream err;
      err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
  }
  /** compute the inverse of the transpose of matrix A and its determinant value in log
   *  when T_FP and TMAT are different types (mixed precision)
   */
  template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
  std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                      Matrix<TMAT>& Ainv,
                                                                      Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu,
                                                                      std::complex<TREAL>& log_value)
  {
    const int norb = logdetT.rows();
    // ... copy logdetT to the device; copy_matrix_cuda promotes it to T_FP precision ...
    cudaErrorCheck(hipMemcpyAsync(/* ... */), "hipMemcpyAsync failed!");
    // ...
    rocsolverErrorCheck(rocsolver::getrf(/* ... */), "rocsolver::getrf failed!");
    // ...
    cudaErrorCheck(hipMemcpyAsync(/* ipiv_gpu -> ipiv ... */), "hipMemcpyAsync failed!");
    // ...
    cudaErrorCheck(hipMemcpyAsync(/* LU_diag_gpu -> LU_diag ... */), "hipMemcpyAsync failed!");
    // ...
    if (/* elided: devInfo reported in ipiv[0] is nonzero */)
    {
      std::ostringstream err;
      err << "rocsolver::getrf calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
    // ...
    rocsolverErrorCheck(rocsolver::getrs(/* ... */), "rocsolver::getrs failed!");
    // ...
    cudaErrorCheck(hipMemcpyAsync(/* ... */), "hipMemcpyAsync failed!");
    // ... convert the T_FP result back to TMAT precision on the device, then copy to the host ...
    cudaErrorCheck(hipMemcpyAsync(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT),
                                  hipMemcpyDeviceToHost, /* hip stream */),
                   "hipMemcpyAsync failed!");
    // ...
    if (/* elided: devInfo reported in ipiv[0] is nonzero */)
    {
      std::ostringstream err;
      err << "rocsolver::getrs calculation failed with devInfo = " << ipiv[0] << std::endl;
      std::cerr << err.str();
      throw std::runtime_error(err.str());
    }
    // check the diagonal of the computed inverse for bad values
    std::ostringstream nan_msg;
    for (int i = 0; i < norb; i++)
      if (/* elided: isnan test on Ainv[i][i] */)
        nan_msg << " Ainv[" << i << "][" << i << "] has bad value " << Ainv[i][i] << std::endl;
    if (const std::string str = nan_msg.str(); !str.empty())
      throw std::runtime_error("Inverse matrix diagonal check found:\n" + str);
  }
};
} // namespace qmcplusplus
#endif // QMCPLUSPLUS_ROCSOLVERINVERTER_H
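The resize() fragment above ends a rocBLAS device-memory size query. Below is a minimal standalone sketch of that idiom, assuming the plain rocSOLVER C API (rocsolver_dgetrf) rather than QMCPACK's rocsolver::getrf wrapper; while a size query is active, the calls only record their workspace needs (returning rocblas_status_size_increased / rocblas_status_size_unchanged) instead of doing any work.

// Hedged sketch of the rocBLAS workspace size-query idiom; not QMCPACK code.
#include <hip/hip_runtime.h>
#include <rocsolver/rocsolver.h> // <rocsolver.h> on older ROCm releases

// Returns the number of device bytes rocSOLVER wants for an n x n getrf on this handle.
// d_A, d_ipiv and d_info are device pointers owned by the caller.
size_t query_getrf_workspace(rocblas_handle handle, rocblas_int n,
                             double* d_A, rocblas_int* d_ipiv, rocblas_int* d_info)
{
  size_t memory_size = 0;
  rocblas_start_device_memory_size_query(handle);
  rocsolver_dgetrf(handle, n, n, d_A, n, d_ipiv, d_info); // recorded, not executed
  rocblas_stop_device_memory_size_query(handle, &memory_size);
  return memory_size;
}

// A caller would then allocate memory_size bytes (e.g. with hipMalloc) and register the buffer
// via rocblas_set_workspace(handle, workspace_ptr, memory_size), which appears to be the role
// of work_gpu in the class above.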
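Both invert_transpose overloads follow the same LU pattern: getrf factorizes the matrix in place on the device, an identity matrix is written into the output buffer, and getrs solves A X = I so that X = A^-1. The self-contained sketch below reproduces that pattern with the plain rocSOLVER C API on a small column-major matrix; names, error handling, and the blocking synchronization are simplified assumptions, not QMCPACK code.

// Hedged sketch: invert a small matrix with rocSOLVER getrf + getrs (solve A * X = I).
#include <hip/hip_runtime.h>
#include <rocsolver/rocsolver.h> // <rocsolver.h> on older ROCm releases
#include <cstdio>
#include <vector>

int main()
{
  const rocblas_int n = 3;
  std::vector<double> A = {4, 1, 0, 1, 3, 1, 0, 1, 2}; // column-major 3x3
  std::vector<double> B(n * n, 0.0);                   // holds the identity, then A^-1
  for (rocblas_int i = 0; i < n; ++i)
    B[i * n + i] = 1.0;

  rocblas_handle handle;
  rocblas_create_handle(&handle);

  double *d_A, *d_B;
  rocblas_int *d_ipiv, *d_info;
  hipMalloc(&d_A, sizeof(double) * A.size());
  hipMalloc(&d_B, sizeof(double) * B.size());
  hipMalloc(&d_ipiv, sizeof(rocblas_int) * n);
  hipMalloc(&d_info, sizeof(rocblas_int));
  hipMemcpy(d_A, A.data(), sizeof(double) * A.size(), hipMemcpyHostToDevice);
  hipMemcpy(d_B, B.data(), sizeof(double) * B.size(), hipMemcpyHostToDevice);

  // LU factorization with partial pivoting, in place: A = P * L * U
  rocsolver_dgetrf(handle, n, n, d_A, n, d_ipiv, d_info);
  // Solve A * X = I using the LU factors; X overwrites the identity in d_B
  rocsolver_dgetrs(handle, rocblas_operation_none, n, n, d_A, n, d_ipiv, d_B, n);

  rocblas_int info = 0;
  hipDeviceSynchronize();
  hipMemcpy(&info, d_info, sizeof(rocblas_int), hipMemcpyDeviceToHost);
  hipMemcpy(B.data(), d_B, sizeof(double) * B.size(), hipMemcpyDeviceToHost);
  std::printf("getrf info = %d, Ainv(0,0) = %g\n", static_cast<int>(info), B[0]);

  hipFree(d_A);
  hipFree(d_B);
  hipFree(d_ipiv);
  hipFree(d_info);
  rocblas_destroy_handle(handle);
  return 0;
}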
Member and helper reference (cross-reference briefs from the generated documentation):

rocSolverInverter()
    default constructor
void resize(int norb)
    resize the internal storage
std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT, Matrix<TMAT>& Ainv, Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu, std::complex<TREAL>& log_value)
    compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are the same
std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT, Matrix<TMAT>& Ainv, Matrix<TMAT, CUDAAllocator<TMAT>>& Ainv_gpu, std::complex<TREAL>& log_value)
    compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT are not the same
rocblas_handle h_rocsolver_
Matrix<T_FP, CUDAAllocator<T_FP>> Mat1_gpu
    scratch memory for cusolverDN
Matrix<T_FP, CUDAAllocator<T_FP>> Mat2_gpu
    scratch memory for cusolverDN
Vector<T_FP, CUDAAllocator<T_FP>> work_gpu
    workspace
Vector<int, CUDAHostAllocator<int>> ipiv
    pivot array + info
Vector<int, CUDAAllocator<int>> ipiv_gpu
Vector<T_FP, CUDAHostAllocator<T_FP>> LU_diag
    diagonal terms of LU matrix
Vector<T_FP, CUDAAllocator<T_FP>> LU_diag_gpu

Helpers referenced by the listing:

#define rocsolverErrorCheck(ans, cause)
rocblas_status getrf(rocblas_handle& handle, int m, int n, double* A, int lda, int* ipiv, int* info)
rocblas_status getrs(rocblas_handle& handle, const rocblas_operation& transa, int m, int n, double* A, int lda, int* ipiv, double* B, int ldb)
void make_identity_matrix_cuda(const int nrows, double* mat, const int lda, cudaStream_t hstream)
    create identity matrix on the device
void extract_matrix_diagonal_cuda(const int nrows, const double* mat, const int lda, double* diag, cudaStream_t hstream)
    extract matrix diagonal
void copy_matrix_cuda(const int nrows, const int ncols, const double* mat_in, const int lda, float* mat_out, const int ldb, cudaStream_t hstream)
    copy matrix with precision difference
void computeLogDet(const T* restrict diag, int n, const int* restrict pivot, std::complex<T_FP>& logdet)
bool isnan(float a)
    return true if the value is NaN
CUDAAllocator
    allocator for CUDA device memory
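computeLogDet, listed above, combines the LU diagonal with the pivot indices to produce log(det(A)) as a complex number whose imaginary part carries the sign or phase. A host-side sketch of that accumulation is given below, assuming LAPACK-style 1-based pivots as returned by getrf; the actual helper in the repository may differ in details such as where it runs.

// Hedged host-side sketch of a log-determinant accumulation from LU factors; not the
// repository implementation.
#include <cmath>
#include <complex>

// diag:  diagonal of the LU factorization (U's diagonal), length n
// pivot: 1-based pivot indices from getrf; pivot[i] != i+1 means row i was swapped
template<typename T, typename REAL>
void compute_log_det_sketch(const T* diag, int n, const int* pivot, std::complex<REAL>& logdet)
{
  logdet = std::complex<REAL>{};
  for (int i = 0; i < n; i++)
    // det(A) = prod_i U(i,i) * (-1)^(number of row swaps); sum logs to avoid overflow/underflow
    logdet += std::log(std::complex<REAL>((pivot[i] == i + 1) ? diag[i] : -diag[i]));
}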
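Finally, a hypothetical call pattern inferred from the signatures above; the surrounding QMCPACK types (Matrix, CUDAAllocator) and their headers are assumed, and this snippet has not been compiled against the repository.

// Hypothetical usage sketch based on the documented signatures; not a tested repository snippet.
// QMCPACK headers providing qmcplusplus::Matrix, CUDAAllocator and rocSolverInverter are assumed.
#include <complex>

void invert_psiM_example()
{
  using namespace qmcplusplus;
  const int norb = 64;
  Matrix<double> psiM(norb, norb);                            // host matrix to invert
  Matrix<double> Ainv(norb, norb);                            // host inverse, filled on return
  Matrix<double, CUDAAllocator<double>> Ainv_gpu(norb, norb); // device copy of the inverse
  std::complex<double> log_value;

  rocSolverInverter<double> inverter; // T_FP = double, so the same-type overload is selected
  // fills Ainv / Ainv_gpu with inverse(transpose(psiM)) and log_value with log(det(psiM))
  inverter.invert_transpose(psiM, Ainv, Ainv_gpu, log_value);
}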