d4/d2e/a02051_source.html

 //////////////////////////////////////////////////////////////////////////////////////
 // This file is distributed under the University of Illinois/NCSA Open Source License.
 // See LICENSE file in top directory for details.
 //
 // Copyright (c) 2021 QMCPACK developers.
 //
 // File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
 //
 // File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////

 #include <catch.hpp>
 #include <algorithm>
 #include "OhmmsData/Libxml2Doc.h"
 #include "OhmmsPETE/OhmmsMatrix.h"
 #include "OhmmsPETE/OhmmsVector.h"
 #include "QMCWaveFunctions/WaveFunctionComponent.h"
 #include "QMCWaveFunctions/Fermion/DiracMatrixComputeCUDA.hpp"
 #include "makeRngSpdMatrix.hpp"
 #include "Utilities/for_testing/checkMatrix.hpp"
 #include "Utilities/for_testing/RandomForTest.h"
 #include "Platforms/DualAllocatorAliases.hpp"
 #include "Platforms/CUDA/QueueCUDA.hpp"
 #include "Platforms/CUDA/AccelBLAS_CUDA.hpp"

 // Legacy CPU inversion for temporary testing
 #include "QMCWaveFunctions/Fermion/DiracMatrix.h"


 namespace qmcplusplus
 {
 /// Matrix type whose storage is managed by PinnedDualAllocator. The tests in this file use
 /// both its host pointer (data()) and its device pointer (device_data()).
 template<typename T>
 using OffloadPinnedMatrix = Matrix<T, PinnedDualAllocator<T>>;
 /// Vector type whose storage is managed by PinnedDualAllocator. The tests in this file use
 /// it to receive the log determinant values.
 template<typename T>
 using OffloadPinnedVector = Vector<T, PinnedDualAllocator<T>>;

 // Smoke test for the cuBLAS::geam wrapper: copies a 4x4 host matrix into a device buffer and
 // issues a single geam call with CUBLAS_OP_T on the first operand, alpha = 1 and beta = 0.
 // Only the status of the copy and of the geam call is checked; no result is read back.
 TEST_CASE("DiracMatrixComputeCUDA_cuBLAS_geam_call", "[wavefunction][fermion]")
 {
   const int dim         = 4;
   const int leading_dim = dim;

   OffloadPinnedMatrix<double> mat_a;
   mat_a.resize(dim, dim);
   OffloadPinnedMatrix<double> staging;
   staging.resize(dim, dim);
   OffloadPinnedMatrix<double> mat_c;
   mat_c.resize(dim, dim);

   // Scalars are handed to geam through host pointers.
   const double alpha = 1.0;
   const double beta  = 0.0;

   const std::vector<double> a_values{2, 5, 8, 7, 5, 2, 2, 8, 7, 5, 6, 6, 5, 4, 4, 8};
   std::copy(a_values.begin(), a_values.end(), mat_a.data());

   compute::Queue<PlatformKind::CUDA> queue;
   compute::BLASHandle<PlatformKind::CUDA> blas_handle(queue);

   // Host contents of mat_a -> device side of the staging matrix, enqueued on the queue's stream.
   const auto num_bytes = mat_a.size() * sizeof(double);
   cudaCheck(cudaMemcpyAsync(static_cast<void*>(staging.device_data()), static_cast<void*>(mat_a.data()), num_bytes,
                             cudaMemcpyHostToDevice, queue.getNative()));

   // Output is written to the device side of mat_a; mat_c is the second operand (scaled by beta = 0).
   const auto geam_status =
       cuBLAS::geam(blas_handle.h_cublas, CUBLAS_OP_T, CUBLAS_OP_N, dim, dim, &alpha, staging.device_data(),
                    leading_dim, &beta, mat_c.device_data(), leading_dim, mat_a.device_data(), leading_dim);
   cublasErrorCheck(geam_status, "cuBLAS::geam failed.");
 }

 // Runs DiracMatrixComputeCUDA<double> on copies of one 4x4 matrix with one, two and three
 // matrices per call. Every returned matrix is compared against the same precomputed reference;
 // the log values are checked after the two batched (mw_invertTranspose) calls.
 TEST_CASE("DiracMatrixComputeCUDA_different_batch_sizes", "[wavefunction][fermion]")
 {
   using PinnedMat = OffloadPinnedMatrix<double>;

   const std::vector<double> a_values{2, 5, 8, 7, 5, 2, 2, 8, 7, 5, 6, 6, 5, 4, 4, 8};
   // Sizes a matrix to 4x4 and fills its host side with the shared input values.
   const auto load_input = [&a_values](PinnedMat& target) {
     target.resize(4, 4);
     std::copy(a_values.begin(), a_values.end(), target.data());
   };

   // Reference values every result of this test is compared against.
   const double inv_values[16]{-0.08247423, -0.26804124, 0.26804124, 0.05154639,  0.18556701,  -0.89690722,
                               0.39690722,  0.13402062,  0.24742268, -0.19587629, 0.19587629,  -0.15463918,
                               -0.29896907, 1.27835052,  -0.77835052, 0.06185567};
   PinnedMat expected_inv;
   expected_inv.resize(4, 4);
   std::copy_n(inv_values, 16, expected_inv.data());

   // Fails the test case with checkMatrix's message when `computed` differs from the reference.
   const auto require_reference = [&expected_inv](PinnedMat& computed) {
     auto check_matrix_result = checkMatrix(computed, expected_inv);
     CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
   };

   const std::complex<double> expected_log{5.267858159063328, 6.283185307179586};

   compute::Queue<PlatformKind::CUDA> queue;
   DiracMatrixComputeCUDA<double> dmcc;
   OffloadPinnedVector<std::complex<double>> log_values;

   // --- single matrix ---
   PinnedMat mat_a;
   load_input(mat_a);
   PinnedMat inv_mat_a;
   inv_mat_a.resize(4, 4);

   log_values.resize(1);
   dmcc.invert_transpose(queue, mat_a, inv_mat_a, log_values);
   require_reference(inv_mat_a);

   // --- batch of two ---
   PinnedMat mat_a2;
   load_input(mat_a2);
   PinnedMat inv_mat_a2;
   inv_mat_a2.resize(4, 4);

   RefVector<const PinnedMat> a_mats{mat_a, mat_a2};
   RefVector<PinnedMat> inv_a_mats{inv_mat_a, inv_mat_a2};

   log_values.resize(2);
   dmcc.mw_invertTranspose(queue, a_mats, inv_a_mats, log_values);

   require_reference(inv_mat_a);
   require_reference(inv_mat_a2);
   CHECK(log_values[0] == ComplexApprox(expected_log));
   CHECK(log_values[1] == ComplexApprox(expected_log));

   // --- batch of three ---
   PinnedMat mat_a3;
   load_input(mat_a3);
   PinnedMat inv_mat_a3;
   inv_mat_a3.resize(4, 4);

   RefVector<const PinnedMat> a_mats3{mat_a, mat_a2, mat_a3};
   RefVector<PinnedMat> inv_a_mats3{inv_mat_a, inv_mat_a2, inv_mat_a3};

   log_values.resize(3);
   dmcc.mw_invertTranspose(queue, a_mats3, inv_a_mats3, log_values);

   require_reference(inv_mat_a);
   require_reference(inv_mat_a2);
   require_reference(inv_mat_a3);
   CHECK(log_values[0] == ComplexApprox(expected_log));
   CHECK(log_values[1] == ComplexApprox(expected_log));
   CHECK(log_values[2] == ComplexApprox(expected_log));
 }

 // Inverts two 64x64 complex matrices produced by testing::MakeRngSpdMatrix in one batched
 // mw_invertTranspose call and compares each result with the legacy DiracMatrix::invert_transpose
 // applied to the same input.
 TEST_CASE("DiracMatrixComputeCUDA_complex_determinants_against_legacy", "[wavefunction][fermion]")
 {
   using Value     = std::complex<double>;
   using PinnedMat = OffloadPinnedMatrix<Value>;

   const int dim = 64;
   compute::Queue<PlatformKind::CUDA> queue;

   DiracMatrixComputeCUDA<Value> dmcc;

   // The pinned matrix is not constructed directly from the host Matrix (the original author
   // notes this is not possible), so the entries are transferred one by one.
   const auto copy_entries = [dim](auto& source, auto& destination) {
     for (int row = 0; row < dim; ++row)
       for (int col = 0; col < dim; ++col)
         destination(row, col) = source(row, col);
   };

   Matrix<Value> mat_spd;
   mat_spd.resize(dim, dim);
   testing::MakeRngSpdMatrix<Value> make_spd;
   make_spd(mat_spd);
   PinnedMat mat_a(dim, dim);
   copy_entries(mat_spd, mat_a);

   // Second input comes from the same functor, called after the first.
   Matrix<Value> mat_spd2;
   mat_spd2.resize(dim, dim);
   make_spd(mat_spd2);
   PinnedMat mat_a2(dim, dim);
   copy_entries(mat_spd2, mat_a2);

   OffloadPinnedVector<Value> log_values;
   log_values.resize(2);
   PinnedMat inv_mat_a;
   inv_mat_a.resize(dim, dim);
   PinnedMat inv_mat_a2;
   inv_mat_a2.resize(dim, dim);

   RefVector<const PinnedMat> a_mats{mat_a, mat_a2};
   RefVector<PinnedMat> inv_a_mats{inv_mat_a, inv_mat_a2};

   dmcc.mw_invertTranspose(queue, a_mats, inv_a_mats, log_values);

   // Legacy CPU path provides the reference; the same output matrix is reused for both inputs.
   DiracMatrix<Value> legacy;
   Matrix<Value> reference_inv(dim, dim);
   Value legacy_log_det;

   legacy.invert_transpose(mat_spd, reference_inv, legacy_log_det);
   auto first_check = checkMatrix(inv_mat_a, reference_inv);
   CHECKED_ELSE(first_check.result) { FAIL(first_check.result_message); }

   legacy.invert_transpose(mat_spd2, reference_inv, legacy_log_det);
   auto second_check = checkMatrix(inv_mat_a2, reference_inv);
   CHECKED_ELSE(second_check.result) { FAIL(second_check.result_message); }
 }

 // Inverts two 64x64 real matrices produced by testing::MakeRngSpdMatrix in one batched
 // mw_invertTranspose call and compares each result with the legacy DiracMatrix::invert_transpose
 // applied to the same input.
 TEST_CASE("DiracMatrixComputeCUDA_large_determinants_against_legacy", "[wavefunction][fermion]")
 {
   using PinnedMat = OffloadPinnedMatrix<double>;

   const int dim = 64;
   compute::Queue<PlatformKind::CUDA> queue;
   DiracMatrixComputeCUDA<double> dmcc;

   // The pinned matrix is not constructed directly from the host Matrix (the original author
   // notes this is not possible), so the entries are transferred one by one.
   const auto copy_entries = [dim](auto& source, auto& destination) {
     for (int row = 0; row < dim; ++row)
       for (int col = 0; col < dim; ++col)
         destination(row, col) = source(row, col);
   };

   Matrix<double> mat_spd;
   mat_spd.resize(dim, dim);
   testing::MakeRngSpdMatrix<double> make_spd;
   make_spd(mat_spd);
   PinnedMat mat_a(dim, dim);
   copy_entries(mat_spd, mat_a);

   // Second input comes from the same functor, called after the first.
   Matrix<double> mat_spd2;
   mat_spd2.resize(dim, dim);
   make_spd(mat_spd2);
   PinnedMat mat_a2(dim, dim);
   copy_entries(mat_spd2, mat_a2);

   OffloadPinnedVector<std::complex<double>> log_values;
   log_values.resize(2);
   PinnedMat inv_mat_a;
   inv_mat_a.resize(dim, dim);
   PinnedMat inv_mat_a2;
   inv_mat_a2.resize(dim, dim);

   RefVector<const PinnedMat> a_mats{mat_a, mat_a2};
   RefVector<PinnedMat> inv_a_mats{inv_mat_a, inv_mat_a2};

   dmcc.mw_invertTranspose(queue, a_mats, inv_a_mats, log_values);

   // Legacy CPU path provides the reference; the same output matrix is reused for both inputs.
   DiracMatrix<double> legacy;
   Matrix<double> reference_inv(dim, dim);
   std::complex<double> legacy_log_det;

   legacy.invert_transpose(mat_spd, reference_inv, legacy_log_det);
   auto first_check = checkMatrix(inv_mat_a, reference_inv);
   CHECKED_ELSE(first_check.result) { FAIL(first_check.result_message); }

   legacy.invert_transpose(mat_spd2, reference_inv, legacy_log_det);
   auto second_check = checkMatrix(inv_mat_a2, reference_inv);
   CHECKED_ELSE(second_check.result) { FAIL(second_check.result_message); }
 }

 } // namespace qmcplusplus
CUBLAS_OP_N
#define CUBLAS_OP_N
Definition: cuda2hip.h:19

qmcplusplus::DataLocality::queue

qmcplusplus::compute::BLASHandle< PlatformKind::CUDA >::h_cublas
cublasHandle_t h_cublas
Definition: AccelBLAS_CUDA.hpp:36

qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43

qmcplusplus::OffloadPinnedVector
Vector< T, PinnedDualAllocator< T > > OffloadPinnedVector
Definition: benchmark_DiracMatrixComputeCUDA.cpp:42

qmcplusplus::log_values
std::vector< StdComp, CUDAHostAllocator< StdComp > > log_values(batch_size)

qmcplusplus::TEST_CASE
TEST_CASE("complex_helper", "[type_traits]")
Definition: test_complex_helper.cpp:38

RandomForTest.h

qmcplusplus::CHECKED_ELSE
CHECKED_ELSE(check_matrix_result.result)
Definition: test_cuBLAS_LU.cpp:455

checkMatrix.hpp

qmcplusplus::Vector
Definition: OhmmsVector.h:33

qmcplusplus::check_matrix_result
auto check_matrix_result
Definition: test_cuBLAS_LU.cpp:454

qmcplusplus::Matrix::resize
void resize(size_type n, size_type m)
Resize the container.
Definition: OhmmsMatrix.h:99

qmcplusplus::DiracMatrixComputeCUDA::mw_invertTranspose
std::enable_if_t<!std::is_same< VALUE_FP, TMAT >::value > mw_invertTranspose(compute::Queue< PlatformKind::CUDA > &queue, const RefVector< const DualMatrix< TMAT >> &a_mats, const RefVector< DualMatrix< TMAT >> &inv_a_mats, DualVector< LogValue > &log_values)
Mixed precision specialization When TMAT is not full precision we need to still do the inversion and ...
Definition: DiracMatrixComputeCUDA.hpp:291

AccelBLAS_CUDA.hpp

qmcplusplus::Matrix::size
size_type size() const
Definition: OhmmsMatrix.h:76

qmcplusplus::testing::makeRngSpdMatrix
void makeRngSpdMatrix(testing::RandomForTest< RngValueType< T >> &rng, Matrix< T > &mat_spd)
Definition: makeRngSpdMatrix.hpp:37

makeRngSpdMatrix.hpp

Libxml2Doc.h

DiracMatrix.h

qmcplusplus::DiracMatrix
helper class to compute matrix inversion and the log value of determinant
Definition: DiracMatrix.h:111

DualAllocatorAliases.hpp
These allocators are to make code that should be generic with the respect to accelerator code flavor ...

qmcplusplus::DiracMatrixComputeCUDA::invert_transpose
void invert_transpose(compute::Queue< PlatformKind::CUDA > &queue, DualMatrix< TMAT > &a_mat, DualMatrix< TMAT > &inv_a_mat, DualVector< LogValue > &log_values)
Given a_mat, returns the inverted a_mat and the log determinant of a_mat.
Definition: DiracMatrixComputeCUDA.hpp:244

CUBLAS_OP_T
#define CUBLAS_OP_T
Definition: cuda2hip.h:20

qmcplusplus::compute::BLASHandle< PlatformKind::CUDA >
Definition: AccelBLAS_CUDA.hpp:30

qmcplusplus::lda
int lda
Definition: test_cuBLAS_LU.cpp:217

cudaMemcpyHostToDevice
#define cudaMemcpyHostToDevice
Definition: cuda2hip.h:139

qmcplusplus::Matrix::device_data
pointer device_data()
Definition: OhmmsMatrix.h:188

qmcplusplus::RefVector
std::vector< std::reference_wrapper< T > > RefVector
Definition: template_types.hpp:32

qmcplusplus::n
int n
Definition: test_cuBLAS_LU.cpp:216

DiracMatrixComputeCUDA.hpp

OhmmsVector.h
Declaration of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...

qmcplusplus::syclBLAS::copy_n
sycl::event copy_n(sycl::queue &aq, const T1 *restrict VA, size_t array_size, T2 *restrict VC, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:548

qmcplusplus::DiracMatrixComputeCUDA
class defining a compute and memory resource to compute matrix inversion and the log determinants of ...
Definition: DiracMatrixComputeCUDA.hpp:49

qmcplusplus::Matrix
Definition: OhmmsMatrix.h:27

qmcplusplus::checkMatrix
CheckMatrixResult checkMatrix(M1 &a_mat, M2 &b_mat, const bool check_all=false, std::optional< const double > eps=std::nullopt)
This function checks equality a_mat and b_mat elements M1, M2 need to have their element type declare...
Definition: checkMatrix.hpp:63

qmcplusplus::cuBLAS::geam
cublasStatus_t geam(cublasHandle_t &handle, cublasOperation_t &transa, cublasOperation_t &transb, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc)
Definition: cuBLAS.hpp:110

qmcplusplus::CHECK
CHECK(log_values[0]==ComplexApprox(std::complex< double >{ 5.603777579195571, -6.1586603331188225 }))

qmcplusplus::Matrix::data
pointer data()
Definition: OhmmsMatrix.h:182

WaveFunctionComponent.h
Declaration of WaveFunctionComponent.

cublasErrorCheck
#define cublasErrorCheck(ans, cause)
Definition: cuBLAS.hpp:34

cudaCheck
#define cudaCheck(ans)
Definition: CUDAerror.h:27

qmcplusplus::OffloadPinnedMatrix
Matrix< T, PinnedDualAllocator< T > > OffloadPinnedMatrix
Definition: benchmark_DiracMatrixComputeCUDA.cpp:40

qmcplusplus::Units::distance::A
const real A
Definition: unit_conversion.h:38

qmcplusplus::DiracMatrix::invert_transpose
std::enable_if_t< std::is_same< T_FP, TMAT >::value > invert_transpose(const Matrix< TMAT, ALLOC1 > &amat, Matrix< TMAT, ALLOC2 > &invMat, std::complex< TREAL > &LogDet)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: DiracMatrix.h:188

OhmmsMatrix.h

QueueCUDA.hpp

qmcplusplus::compute::Queue< PlatformKind::CUDA >
Definition: QueueCUDA.hpp:25

qmcplusplus::testing::MakeRngSpdMatrix
Functor to provide scope for rng when making SpdMatrix for testing.
Definition: makeRngSpdMatrix.hpp:118

cudaMemcpyAsync
#define cudaMemcpyAsync
Definition: cuda2hip.h:136