d4/d8e/a01694_source.html

 //////////////////////////////////////////////////////////////////////////////////////
 // This file is distributed under the University of Illinois/NCSA Open Source License.
 // See LICENSE file in top directory for details.
 //
 // Copyright (c) 2019 QMCPACK developers.
 //
 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////

 #ifndef QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H
 #define QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H

 #include <algorithm>
 #include <complex>
 #include <cstdint>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
 #include "OhmmsPETE/OhmmsVector.h"
 #include "OhmmsPETE/OhmmsMatrix.h"
 #include "SYCL/SYCLallocator.hpp"
 #include "SYCL/syclBLAS.hpp"
 #include "SYCL/syclSolver.hpp"
 #include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp"
 #include "CPU/math.hpp"

 namespace qmcplusplus
 {
 /** implements matrix inversion on a SYCL device through the syclSolver getrf/getri wrappers
  * @tparam T_FP high precision for matrix inversion, T_FP >= T
  *
  * NOTE(review): dependent operations are submitted to the queue back-to-back without explicit
  * event dependencies, which presumably relies on the queue being in-order -- confirm with callers.
  */
 template<typename T_FP>
 class syclSolverInverter
 {
   /// device scratch matrix (norb x norb) in T_FP precision
   Matrix<T_FP, SYCLAllocator<T_FP>> Mat1_gpu;
   /// pivot array written by getrf and read by computeLogDet_sycl and getri
   Vector<std::int64_t, SYCLAllocator<std::int64_t>> ipiv;
   /// scratchpad shared by getrf and getri, sized to the larger of the two requirements
   Vector<T_FP, SYCLAllocator<T_FP>> workspace;
   /// scratchpad size reported by syclSolver::getrf_scratchpad_size
   std::int64_t getrf_ws = 0;
   /// scratchpad size reported by syclSolver::getri_scratchpad_size
   std::int64_t getri_ws = 0;

   /** resize the internal storage
    * @param norb number of electrons/orbitals
    * @param m_queue queue used to query the getrf/getri scratchpad sizes
    *
    * Nothing is done when the scratch matrix already has norb rows.
    */
   inline void resize(int norb, sycl::queue& m_queue)
   {
     if (Mat1_gpu.rows() == norb)
       return;

     Mat1_gpu.resize(norb, norb);
     ipiv.resize(norb);
     getrf_ws = syclSolver::getrf_scratchpad_size<T_FP>(m_queue, norb, norb, norb);
     getri_ws = syclSolver::getri_scratchpad_size<T_FP>(m_queue, norb, norb);
     // a single buffer serves both routines because they are never in flight together
     workspace.resize(std::max(getrf_ws, getri_ws));
   }

   /** LU-factorize a device matrix in place, evaluate its log determinant and replace it by its inverse
    * @tparam TREAL real type of log_value
    * @param mat device pointer to the matrix; getrf/getri are called with leading dimension norb
    * @param norb matrix dimension
    * @param lda leading dimension handed to computeLogDet_sycl
    * @param log_value receives the value returned by computeLogDet_sycl
    * @param m_queue queue all operations are submitted to
    * @throws std::runtime_error when getrf raises a sycl::exception
    *
    * The event returned by getrf is waited on. The return value of getri is not waited on here,
    * so callers must synchronize before consuming the inverse on the host.
    */
   template<typename TREAL>
   inline void factorizeAndInvert(T_FP* mat, int norb, int lda, std::complex<TREAL>& log_value, sycl::queue& m_queue)
   {
     //getrf (LU)
     try
     {
       syclSolver::getrf(m_queue, norb, norb, mat, norb, ipiv.data(), workspace.data(), getrf_ws).wait();
     }
     catch (sycl::exception const& ex)
     {
       std::ostringstream err;
       err << "\t\tCaught synchronous SYCL exception during getrf:\n"
           << ex.what() << "  status: " << ex.code() << '\n';
       std::cerr << err.str();
       throw std::runtime_error(err.str());
     }

     // the log determinant is evaluated from the LU factors and pivots before getri overwrites them
     log_value = computeLogDet_sycl<TREAL>(m_queue, norb, lda, mat, ipiv.data());

     //getri (inverse)
     syclSolver::getri(m_queue, norb, mat, norb, ipiv.data(), workspace.data(), getri_ws);
   }

 public:
   /** compute the inverse of the transpose of matrix A and its determinant value in log
    * when T_FP and TMAT are the same
    * @tparam TMAT matrix element type, identical to T_FP for this overload
    * @tparam TREAL real type
    * @param logdetT host matrix to be transposed and inverted
    * @param Ainv host matrix receiving the inverse
    * @param Ainv_gpu device matrix in which the inversion is carried out and left
    * @param log_value receives the log determinant
    * @param m_queue queue all device work is submitted to
    * @throws std::runtime_error when getrf raises a sycl::exception
    *
    * NOTE(review): assumes logdetT is square and that Ainv and Ainv_gpu hold norb x norb elements;
    * none of this is checked here.
    */
   template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
   std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                      Matrix<TMAT>& Ainv,
                                                                      Matrix<TMAT, SYCLAllocator<TMAT>>& Ainv_gpu,
                                                                      std::complex<TREAL>& log_value,
                                                                      sycl::queue& m_queue)
   {
     const int norb = logdetT.rows();
     resize(norb, m_queue);

     // stage the host matrix in Mat1_gpu, then transpose it into Ainv_gpu where the inversion happens in place
     m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
     syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb, Ainv_gpu.cols());

     factorizeAndInvert(Ainv_gpu.data(), norb, static_cast<int>(Ainv_gpu.cols()), log_value, m_queue);

     // blocking copy of the inverse back to the host
     m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT)).wait();
   }

   /** compute the inverse of the transpose of matrix A and its determinant value in log
    * when T_FP and TMAT are not the same
    * @tparam TMAT matrix element type, different from T_FP for this overload
    * @tparam TREAL real type
    * @param logdetT host matrix to be transposed and inverted
    * @param Ainv host matrix receiving the inverse
    * @param Ainv_gpu device matrix used for staging the input and receiving the inverse
    * @param log_value receives the log determinant
    * @param m_queue queue all device work is submitted to
    * @throws std::runtime_error when getrf raises a sycl::exception or a diagonal element of Ainv is NaN
    *
    * NOTE(review): assumes logdetT is square and that Ainv and Ainv_gpu hold norb x norb elements;
    * none of this is checked here.
    */
   template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
   std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
                                                                       Matrix<TMAT>& Ainv,
                                                                       Matrix<TMAT, SYCLAllocator<TMAT>>& Ainv_gpu,
                                                                       std::complex<TREAL>& log_value,
                                                                       sycl::queue& m_queue)
   {
     const int norb = logdetT.rows();
     resize(norb, m_queue);

     //use Ainv_gpu for staging, then transpose into the T_FP scratch matrix
     m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
     syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb, Mat1_gpu.cols());

     factorizeAndInvert(Mat1_gpu.data(), norb, static_cast<int>(Mat1_gpu.cols()), log_value, m_queue);

     // copy the T_FP inverse into the TMAT device matrix
     syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data());

     // blocking copy of the inverse back to the host
     m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT)).wait();

     // reject a result whose diagonal contains NaN
     for (int i = 0; i < norb; i++)
       if (qmcplusplus::isnan(std::norm(Ainv[i][i])))
         throw std::runtime_error("Ainv[i][i] is NaN. i = " + std::to_string(i));
   }
 };
 } // namespace qmcplusplus

 #endif // QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H
qmcplusplus::DataLocality::queue

qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43

qmcplusplus::SYCLAllocator
allocator for SYCL device memory  T data type  ALIGN alignment in bytes
Definition: SYCLallocator.hpp:97

sycl_determinant_helper.hpp

qmcplusplus::Vector
Definition: OhmmsVector.h:33

qmcplusplus::syclSolverInverter::Mat1_gpu
Matrix< T_FP, SYCLAllocator< T_FP > > Mat1_gpu
scratch memory for the SYCL solver (getrf/getri)
Definition: syclSolverInverter.hpp:32

qmcplusplus::syclSolverInverter::invert_transpose
std::enable_if_t<!std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, SYCLAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value, sycl::queue &m_queue)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: syclSolverInverter.hpp:100

qmcplusplus::syclSolverInverter
implements matrix inversion via the SYCL solver wrappers (syclSolver getrf/getri)
Definition: syclSolverInverter.hpp:29

qmcplusplus::syclBLAS::transpose
sycl::event transpose(sycl::queue &q, const T1 *restrict in, int m, int lda, T2 *restrict out, int n, int ldb, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:475

qmcplusplus::Matrix::size
size_type size() const
Definition: OhmmsMatrix.h:76

norm
double norm(const zVec &c)
Definition: VectorOps.h:118

qmcplusplus::rocsolver::getri
rocblas_status getri(rocblas_handle &handle, int n, double *A, int lda, int *ipiv, int *info)
Definition: rocsolver.hpp:143

syclBLAS.hpp

qmcplusplus::syclSolverInverter::workspace
Vector< T_FP, SYCLAllocator< T_FP > > workspace
workspace
Definition: syclSolverInverter.hpp:36

syclSolver.hpp

qmcplusplus::Matrix::rows
size_type rows() const
Definition: OhmmsMatrix.h:77

OhmmsVector.h
Declaration of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...

qmcplusplus::syclBLAS::copy_n
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548

qmcplusplus::Matrix
Definition: OhmmsMatrix.h:27

SYCLallocator.hpp
this file provides three C++ memory allocators using SYCL specific memory allocation functions...

qmcplusplus::Matrix::data
pointer data()
Definition: OhmmsMatrix.h:182

qmcplusplus::syclSolverInverter::getrf_ws
std::int64_t getrf_ws
Definition: syclSolverInverter.hpp:37

qmcplusplus::syclSolverInverter::invert_transpose
std::enable_if_t< std::is_same< TMAT, T_FP >::value > invert_transpose(const Matrix< TMAT > &logdetT, Matrix< TMAT > &Ainv, Matrix< TMAT, SYCLAllocator< TMAT >> &Ainv_gpu, std::complex< TREAL > &log_value, sycl::queue &m_queue)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: syclSolverInverter.hpp:62

OhmmsMatrix.h

qmcplusplus::syclSolverInverter::resize
void resize(int norb, sycl::queue &m_queue)
resize the internal storage
Definition: syclSolverInverter.hpp:44

qmcplusplus::syclSolverInverter::getri_ws
std::int64_t getri_ws
Definition: syclSolverInverter.hpp:38

qmcplusplus::syclSolverInverter::ipiv
Vector< std::int64_t, SYCLAllocator< std::int64_t > > ipiv
pivot array + info
Definition: syclSolverInverter.hpp:34

qmcplusplus::cusolver::getrf
cusolverStatus_t getrf(cusolverDnHandle_t &handle, int m, int n, double *A, int lda, double *work, int *ipiv, int *info)
Definition: cusolver.hpp:92

math.hpp

qmcplusplus::isnan
bool isnan(float a)
return true if the value is NaN.
Definition: math.cpp:18