54 std::stringstream stream;
68 TEST_CASE(
"DiracMatrixComputeCUDA_large_determinants_benchmark_legacy_1024_4",
"[wavefunction][fermion][.benchmark]")
71 params.
name =
"Batched CUDA";
78 std::vector<Matrix<double>> spd_mats(params.
batch_size, {params.n, params.n});
79 std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.
batch_size, {params.n, params.n});
85 for (
int i = 0; i < params.
n; ++i)
86 for (
int j = 0; j < params.
n; ++j)
87 pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
91 std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.
batch_size, {params.n, params.n});
93 auto a_mats = makeRefVector<const decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
95 makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
97 std::vector<bool> compute_mask(params.
batch_size,
true);
98 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
104 std::vector<Matrix<double>> inv_mats_test(params.
batch_size, {params.n, params.n});
106 std::vector<std::complex<double>> log_values_test(params.
batch_size);
108 params.
name =
"legacy CPU";
109 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
112 for (
int im = 0; im < params.
batch_size; ++im)
113 dmat.
invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
120 TEST_CASE(
"benchmark_DiracMatrixComputeCUDA_vs_legacy_256_10",
"[wavefunction][fermion][benchmark]")
123 params.
name =
"Batched CUDA";
130 std::vector<Matrix<double>> spd_mats(params.
batch_size, {params.n, params.n});
131 std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.
batch_size, {params.n, params.n});
134 for (
int im = 0; im < params.
batch_size; ++im)
137 for (
int i = 0; i < params.
n; ++i)
138 for (
int j = 0; j < params.
n; ++j)
139 pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
143 std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.
batch_size, {params.n, params.n});
145 auto a_mats = makeRefVector<const decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
147 makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
149 std::vector<bool> compute_mask(params.
batch_size,
true);
150 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
157 std::vector<Matrix<double>> inv_mats_test(params.
batch_size, {params.n, params.n});
159 std::vector<std::complex<double>> log_values_test(params.
batch_size);
161 params.
name =
"legacy CPU";
162 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
165 for (
int im = 0; im < params.
batch_size; ++im)
166 dmat.
invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
173 TEST_CASE(
"benchmark_DiracMatrixComputeCUDASingle_vs_legacy_256_10",
"[wavefunction][fermion][.benchmark]")
176 params.
name =
"Forced Serial Batched CUDA";
183 std::vector<Matrix<double>> spd_mats(params.
batch_size, {params.n, params.n});
184 std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.
batch_size, {params.n, params.n});
187 for (
int im = 0; im < params.
batch_size; ++im)
190 for (
int i = 0; i < params.
n; ++i)
191 for (
int j = 0; j < params.
n; ++j)
192 pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
195 std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.
batch_size, {params.n, params.n});
198 log_value.resize(1, {0, 0});
200 auto a_mats = makeRefVector<decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
202 makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
204 std::vector<bool> compute_mask(params.
batch_size,
true);
205 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
208 for (
int im = 0; im < params.
batch_size; ++im)
215 std::vector<Matrix<double>> inv_mats_test(params.
batch_size, {params.n, params.n});
216 std::vector<std::complex<double>> log_values_test(params.
batch_size);
218 params.
name =
"legacy CPU";
219 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
222 for (
int im = 0; im < params.
batch_size; ++im)
223 dmat.
invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
230 TEST_CASE(
"benchmark_DiracMatrixComputeCUDASingle_vs_legacy_1024_4",
"[wavefunction][fermion][.benchmark]")
233 params.
name =
"Forced Serial Batched CUDA";
240 std::vector<Matrix<double>> spd_mats(params.
batch_size, {params.n, params.n});
241 std::vector<OffloadPinnedMatrix<double>> pinned_spd_mats(params.
batch_size, {params.n, params.n});
245 for (
int im = 0; im < params.
batch_size; ++im)
248 for (
int i = 0; i < params.
n; ++i)
249 for (
int j = 0; j < params.
n; ++j)
250 pinned_spd_mats[im](i, j) = spd_mats[im](i, j);
253 std::vector<OffloadPinnedMatrix<double>> pinned_inv_mats(params.
batch_size, {params.n, params.n});
256 log_value.resize(1, {0, 0});
258 auto a_mats = makeRefVector<decltype(pinned_spd_mats)::value_type>(pinned_spd_mats);
260 makeRefVector<decltype(pinned_inv_mats)::value_type>(pinned_inv_mats);
262 std::vector<bool> compute_mask(params.
batch_size,
true);
263 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
266 for (
int im = 0; im < params.
batch_size; ++im)
273 std::vector<Matrix<double>> inv_mats_test(params.
batch_size, {params.n, params.n});
275 std::vector<std::complex<double>> log_values_test(params.
batch_size);
277 params.
name =
"legacy CPU";
279 BENCHMARK_ADVANCED(params.
str())(Catch::Benchmark::Chronometer
meter)
282 for (
int im = 0; im < params.
batch_size; ++im)
283 dmat.
invert_transpose(spd_mats[im], inv_mats_test[im], log_values_test[im]);
helper functions for EinsplineSetBuilder
std::vector< StdComp, CUDAHostAllocator< StdComp > > log_values(batch_size)
TEST_CASE("complex_helper", "[type_traits]")
std::enable_if_t<!std::is_same< VALUE_FP, TMAT >::value > mw_invertTranspose(compute::Queue< PlatformKind::CUDA > &queue, const RefVector< const DualMatrix< TMAT >> &a_mats, const RefVector< DualMatrix< TMAT >> &inv_a_mats, DualVector< LogValue > &log_values)
Mixed precision specialization When TMAT is not full precision we need to still do the inversion and ...
void makeRngSpdMatrix(testing::RandomForTest< RngValueType< T >> &rng, Matrix< T > &mat_spd)
helper class to compute matrix inversion and the log value of determinant
These allocators are to make code that should be generic with the respect to accelerator code flavor ...
void invert_transpose(compute::Queue< PlatformKind::CUDA > &queue, DualMatrix< TMAT > &a_mat, DualMatrix< TMAT > &inv_a_mat, DualVector< LogValue > &log_values)
Given a_mat returns inverted a_mat and log determinant of a_mat.
std::ostream & operator<<(std::ostream &out, const AntiSymTensor< T, D > &rhs)
std::vector< std::reference_wrapper< T > > RefVector
Declaration of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
class defining a compute and memory resource to compute matrix inversion and the log determinants of ...
Declaration of WaveFunctionComponent.
std::enable_if_t< std::is_same< T_FP, TMAT >::value > invert_transpose(const Matrix< TMAT, ALLOC1 > &amat, Matrix< TMAT, ALLOC2 > &invMat, std::complex< TREAL > &LogDet)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Functor to provide scope for rng when making SpdMatrix for testing.