12 #ifndef QMCPLUSPLUS_CUSOLVERINVERTOR_H    13 #define QMCPLUSPLUS_CUSOLVERINVERTOR_H    28 template<
typename T_FP>
    58       ipiv.resize(norb + 1);
    64                          "cusolver::getrf_bufferSize failed!");
    88   template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
    92                                                                      std::complex<TREAL>& log_value)
    94     const int norb = logdetT.
rows();
    98                    "cudaMemcpyAsync failed!");
   101                        "cusolver::getrf failed!");
   104                    "cudaMemcpyAsync failed!");
   108                    "cudaMemcpyAsync failed!");
   113       std::ostringstream err;
   114       err << 
"cusolver::getrf calculation failed with devInfo = " << 
ipiv[0] << std::endl;
   115       std::cerr << err.str();
   116       throw std::runtime_error(err.str());
   120                                        Ainv_gpu.data(), norb, 
ipiv_gpu.data()),
   121                        "cusolver::getrs failed!");
   123                    "cudaMemcpyAsync failed!");
   127                    "cudaMemcpyAsync failed!");
   132       std::ostringstream err;
   133       err << 
"cusolver::getrs calculation failed with devInfo = " << 
ipiv[0] << std::endl;
   134       std::cerr << err.str();
   135       throw std::runtime_error(err.str());
   143   template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
   147                                                                       std::complex<TREAL>& log_value)
   149     const int norb = logdetT.
rows();
   154                    "cudaMemcpyAsync failed!");
   158                        "cusolver::getrf failed!");
   161                    "cudaMemcpyAsync failed!");
   165                    "cudaMemcpyAsync failed!");
   170       std::ostringstream err;
   171       err << 
"cusolver::getrf calculation failed with devInfo = " << 
ipiv[0] << std::endl;
   172       throw std::runtime_error(err.str());
   177                        "cusolver::getrs failed!");
   180                    "cudaMemcpyAsync failed!");
   184                    "cudaMemcpyAsync failed!");
   189       std::ostringstream err;
   190       err << 
"cusolver::getrs calculation failed with devInfo = " << 
ipiv[0] << std::endl;
   191       throw std::runtime_error(err.str());
   194     std::ostringstream nan_msg;
   195     for(
int i = 0; i < norb; i++)
   197         nan_msg << 
"  Ainv["<< i << 
"][" << i << 
"] has bad value " << Ainv[i][i] << std::endl;
   198     if (
const std::string str = nan_msg.str(); !str.empty())
   199       throw std::runtime_error(
"Inverse matrix diagonal check found:\n" + str);
   204 #endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H 
cuSolverInverter()
default constructor 
cusolverStatus_t getrs(cusolverDnHandle_t &handle, const cublasOperation_t &transa, int m, int n, const double *A, int lda, int *ipiv, double *B, int ldb, int *info)
helper functions for EinsplineSetBuilder 
void make_identity_matrix_cuda(const int nrows, double *mat, const int lda, cudaStream_t hstream)
create identity matrix on the device 
handle CUDA/HIP runtime selection. 
#define cudaStreamDestroy
void extract_matrix_diagonal_cuda(const int nrows, const double *mat, const int lda, double *diag, cudaStream_t hstream)
extract matrix diagonal 
cusolverDnHandle_t h_cusolver_
std::enable_if_t< std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
void copy_matrix_cuda(const int nrows, const int ncols, const double *mat_in, const int lda, float *mat_out, const int ldb, cudaStream_t hstream)
copy matrix with precision difference 
this file provides three C++ memory allocators using CUDA specific memory allocation functions...
Matrix< T_FP, CUDAAllocator< T_FP > > Mat2_gpu
scratch memory for cusolverDN 
Matrix< T_FP, CUDAAllocator< T_FP > > Mat1_gpu
scratch memory for cusolverDN 
void resize(int norb)
resize the internal storage 
Vector< T_FP, CUDAAllocator< T_FP > > work_gpu
workspace 
double norm(const zVec &c)
Vector< int, CUDAAllocator< int > > ipiv_gpu
cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) *lu.size(), cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying log_values to device")
#define cudaMemcpyDeviceToHost
Vector< T_FP, CUDAAllocator< T_FP > > LU_diag_gpu
#define cudaStreamSynchronize
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
#define cudaMemcpyHostToDevice
cusolverStatus_t getrf_bufferSize(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, int *lwork)
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
Vector< int, CUDAHostAllocator< int > > ipiv
pivot array + info 
#define cusolverErrorCheck(ans, cause)
allocator for CUDA device memory 
implements matrix inversion via cuSolverDN 
Vector< T_FP, CUDAHostAllocator< T_FP > > LU_diag
diagonal terms of LU matrix 
cusolverStatus_t getrf(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, double *work, int *ipiv, int *info)
bool isnan(float a)
return true if the value is NaN. 
std::enable_if_t<!std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, CUDAAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...