QMCPACK
benchmark_DiracMatrixComputeCUDA.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2021 QMCPACK developers.
6 //
7 // File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
8 //
9 // File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 
13 /** \file
14  * This implements micro benchmarking on DiracMatrixComputeCUDA.hpp
15  * Currently it also benchmarks the same matrix sizes and batch sizes
16  * using the Legacy DiracMatrix serially.
17  */
18 
19 #include "catch.hpp"
20 
21 #include <algorithm>
22 #include "Configuration.h"
23 #include "OhmmsData/Libxml2Doc.h"
24 #include "OhmmsPETE/OhmmsMatrix.h"
25 #include "OhmmsPETE/OhmmsVector.h"
28 #include "makeRngSpdMatrix.hpp"
33 
34 // Legacy CPU inversion for temporary testing
36 
37 namespace qmcplusplus
38 {
39 template<typename T>
41 template<typename T>
43 
44 // Mechanism to pretty print benchmark names.
struct DiracComputeBenchmarkParameters;
std::ostream& operator<<(std::ostream& out, const DiracComputeBenchmarkParameters& dcbmp);

/** Parameter pack identifying one benchmark configuration; printed via the
 *  operator<< declared above to label Catch2 benchmark output.
 *  NOTE(review): the struct header and the batch_size member were dropped by
 *  the Doxygen scrape; both are reconstructed from their uses in operator<<
 *  and the TEST_CASEs below -- verify against the repository source.
 */
struct DiracComputeBenchmarkParameters
{
  std::string name;  // label of the implementation under test, e.g. "Batched CUDA"
  size_t n;          // matrix dimension (matrices are n x n)
  size_t batch_size; // number of matrices inverted per batch
  /// Render "<name> n=<n> batch=<batch_size>" for use as a benchmark title.
  std::string str()
  {
    std::stringstream stream;
    stream << *this;
    return stream.str();
  }
};
59 
60 std::ostream& operator<<(std::ostream& out, const DiracComputeBenchmarkParameters& dcbmp)
61 {
62  out << dcbmp.name << " n=" << dcbmp.n << " batch=" << dcbmp.batch_size;
63  return out;
64 }
65 
66 /** This and other [.benchmark] benchmarks only run if "[benchmark]" is explicitly passed as tag to test.
67  */
68 TEST_CASE("DiracMatrixComputeCUDA_large_determinants_benchmark_legacy_1024_4", "[wavefunction][fermion][.benchmark]")
69 {
71  params.name = "Batched CUDA";
72  params.n = 1024;
73  params.batch_size = 4;
74 
77 
78  std::vector<Matrix<double>> spd_mats(params.batch_size, {params.n, params.n});
79  std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.batch_size, {params.n, params.n});
80 
82  for (int im = 0; im < params.batch_size; ++im)
83  {
84  makeRngSpdMatrix(spd_mats[im]);
85  for (int i = 0; i < params.n; ++i)
86  for (int j = 0; j < params.n; ++j)
87  pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
88  }
89 
91  std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.batch_size, {params.n, params.n});
92 
93  auto a_mats = makeRefVector<const decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
95  makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
96 
97  std::vector<bool> compute_mask(params.batch_size, true);
98  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
99  {
100  meter.measure([&] { dmcc.mw_invertTranspose(queue, a_mats, inv_a_mats, log_values); });
101  };
102 
103  DiracMatrix<double> dmat;
104  std::vector<Matrix<double>> inv_mats_test(params.batch_size, {params.n, params.n});
105  ;
106  std::vector<std::complex<double>> log_values_test(params.batch_size);
107 
108  params.name = "legacy CPU";
109  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
110  {
111  meter.measure([&] {
112  for (int im = 0; im < params.batch_size; ++im)
113  dmat.invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
114  });
115  };
116 }
117 
118 /** This test will run by default.
119  */
120 TEST_CASE("benchmark_DiracMatrixComputeCUDA_vs_legacy_256_10", "[wavefunction][fermion][benchmark]")
121 {
123  params.name = "Batched CUDA";
124  params.n = 256;
125  params.batch_size = 10;
126 
129 
130  std::vector<Matrix<double>> spd_mats(params.batch_size, {params.n, params.n});
131  std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.batch_size, {params.n, params.n});
132 
134  for (int im = 0; im < params.batch_size; ++im)
135  {
136  makeRngSpdMatrix(spd_mats[im]);
137  for (int i = 0; i < params.n; ++i)
138  for (int j = 0; j < params.n; ++j)
139  pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
140  }
141 
143  std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.batch_size, {params.n, params.n});
144 
145  auto a_mats = makeRefVector<const decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
147  makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
148 
149  std::vector<bool> compute_mask(params.batch_size, true);
150  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
151  {
152  meter.measure([&] { dmcc.mw_invertTranspose(queue, a_mats, inv_a_mats, log_values); });
153  };
154 
155 
156  DiracMatrix<double> dmat;
157  std::vector<Matrix<double>> inv_mats_test(params.batch_size, {params.n, params.n});
158  ;
159  std::vector<std::complex<double>> log_values_test(params.batch_size);
160 
161  params.name = "legacy CPU";
162  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
163  {
164  meter.measure([&] {
165  for (int im = 0; im < params.batch_size; ++im)
166  dmat.invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
167  });
168  };
169 }
170 
171 /** Only runs if [benchmark] tag is passed.
172  */
173 TEST_CASE("benchmark_DiracMatrixComputeCUDASingle_vs_legacy_256_10", "[wavefunction][fermion][.benchmark]")
174 {
176  params.name = "Forced Serial Batched CUDA";
177  params.n = 256;
178  params.batch_size = 10;
179 
182 
183  std::vector<Matrix<double>> spd_mats(params.batch_size, {params.n, params.n});
184  std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.batch_size, {params.n, params.n});
185 
187  for (int im = 0; im < params.batch_size; ++im)
188  {
189  makeRngSpdMatrix(spd_mats[im]);
190  for (int i = 0; i < params.n; ++i)
191  for (int j = 0; j < params.n; ++j)
192  pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
193  }
194 
195  std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.batch_size, {params.n, params.n});
196  std::vector<OffloadPinnedVector<std::complex<double>>> log_values(params.batch_size);
197  for (auto& log_value : log_values)
198  log_value.resize(1, {0, 0});
199 
200  auto a_mats = makeRefVector<decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
202  makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
203 
204  std::vector<bool> compute_mask(params.batch_size, true);
205  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
206  {
207  meter.measure([&] {
208  for (int im = 0; im < params.batch_size; ++im)
209  dmcc.invert_transpose(queue, pinned_spd_mats[im], pinned_inv_mats[im], log_values[im]);
210  });
211  };
212 
213 
214  DiracMatrix<double> dmat;
215  std::vector<Matrix<double>> inv_mats_test(params.batch_size, {params.n, params.n});
216  std::vector<std::complex<double>> log_values_test(params.batch_size);
217 
218  params.name = "legacy CPU";
219  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
220  {
221  meter.measure([&] {
222  for (int im = 0; im < params.batch_size; ++im)
223  dmat.invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
224  });
225  };
226 }
227 
228 /** Only runs if [benchmark] tag is passed.
229  */
230 TEST_CASE("benchmark_DiracMatrixComputeCUDASingle_vs_legacy_1024_4", "[wavefunction][fermion][.benchmark]")
231 {
233  params.name = "Forced Serial Batched CUDA";
234  params.n = 1024;
235  params.batch_size = 4;
236 
239 
240  std::vector<Matrix<double>> spd_mats(params.batch_size, {params.n, params.n});
241  std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.batch_size, {params.n, params.n});
242 
243 
245  for (int im = 0; im < params.batch_size; ++im)
246  {
247  makeRngSpdMatrix(spd_mats[im]);
248  for (int i = 0; i < params.n; ++i)
249  for (int j = 0; j < params.n; ++j)
250  pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
251  }
252 
253  std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.batch_size, {params.n, params.n});
254  std::vector<OffloadPinnedVector<std::complex<double>>> log_values(params.batch_size);
255  for (auto& log_value : log_values)
256  log_value.resize(1, {0, 0});
257 
258  auto a_mats = makeRefVector<decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
260  makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
261 
262  std::vector<bool> compute_mask(params.batch_size, true);
263  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
264  {
265  meter.measure([&] {
266  for (int im = 0; im < params.batch_size; ++im)
267  dmcc.invert_transpose(queue, pinned_spd_mats[im], pinned_inv_mats[im], log_values[im]);
268  });
269  };
270 
271 
272  DiracMatrix<double> dmat;
273  std::vector<Matrix<double>> inv_mats_test(params.batch_size, {params.n, params.n});
274  ;
275  std::vector<std::complex<double>> log_values_test(params.batch_size);
276 
277  params.name = "legacy CPU";
278 
279  BENCHMARK_ADVANCED(params.str())(Catch::Benchmark::Chronometer meter)
280  {
281  meter.measure([&] {
282  for (int im = 0; im < params.batch_size; ++im)
283  dmat.invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
284  });
285  };
286 }
287 
288 
289 } // namespace qmcplusplus
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
std::vector< StdComp, CUDAHostAllocator< StdComp > > log_values(batch_size)
TEST_CASE("complex_helper", "[type_traits]")
std::enable_if_t<!std::is_same< VALUE_FP, TMAT >::value > mw_invertTranspose(compute::Queue< PlatformKind::CUDA > &queue, const RefVector< const DualMatrix< TMAT >> &a_mats, const RefVector< DualMatrix< TMAT >> &inv_a_mats, DualVector< LogValue > &log_values)
Mixed precision specialization When TMAT is not full precision we need to still do the inversion and ...
void makeRngSpdMatrix(testing::RandomForTest< RngValueType< T >> &rng, Matrix< T > &mat_spd)
helper class to compute matrix inversion and the log value of determinant
Definition: DiracMatrix.h:111
These allocators are to make code that should be generic with the respect to accelerator code flavor ...
void invert_transpose(compute::Queue< PlatformKind::CUDA > &queue, DualMatrix< TMAT > &a_mat, DualMatrix< TMAT > &inv_a_mat, DualVector< LogValue > &log_values)
Given a_mat, returns the inverted a_mat and the log determinant of a_mat.
std::ostream & operator<<(std::ostream &out, const AntiSymTensor< T, D > &rhs)
std::vector< std::reference_wrapper< T > > RefVector
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
class defining a compute and memory resource to compute matrix inversion and the log determinants of ...
Declaration of WaveFunctionComponent.
std::enable_if_t< std::is_same< T_FP, TMAT >::value > invert_transpose(const Matrix< TMAT, ALLOC1 > &amat, Matrix< TMAT, ALLOC2 > &invMat, std::complex< TREAL > &LogDet)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: DiracMatrix.h:188
Functor to provide scope for rng when making SpdMatrix for testing.