TEST_CASE("cuBLAS_LU::computeLogDet", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 1;
  auto& hstream     = cuda_handles->hstream;

  // Column-major LU of the real 4x4 test matrix, with the L multipliers stored below the diagonal.
  std::vector<double, CUDAHostAllocator<double>> lu = {7., 0.28571429, 0.71428571, 0.71428571,
                                                       5., 3.57142857, 0.12, -0.44,
                                                       6., 6.28571429, -1.04, -0.46153846,
                                                       6., 5.28571429, 3.08, 7.46153846};
  std::vector<double, CUDAAllocator<double>> dev_lu(16);
  std::vector<double*, CUDAHostAllocator<double*>> lus(1, nullptr);
  std::vector<double*, CUDAHostAllocator<double*>> dev_lus(1);
  lus[0] = dev_lu.data();

  using StdComp = std::complex<double>;
  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 3, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(4);
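  // A minimal sketch of the transfer, batched log-determinant evaluation, and readback, assuming
  // the cuBLAS_LU::computeLogDet_batched(hstream, n, lda, T** Ms, const int* pivots,
  // std::complex<double>* logdets, batch_size) signature referenced by this file; the pointer
  // array and pivots are staged in device-visible memory before the call.
  cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(double) * lu.size(), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lu to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_lus.data(), lus.data(), sizeof(double*) * batch_size, cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lus to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * pivots.size(), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying pivots to device");

  cuBLAS_LU::computeLogDet_batched(hstream, n, lda, dev_lus.data(), dev_pivots.data(), dev_log_values.data(), batch_size);

  cudaErrorCheck(cudaMemcpyAsync(log_values.data(), dev_log_values.data(), sizeof(StdComp) * batch_size, cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying log_values from device");
  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed");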
  CHECK(log_values[0] == ComplexApprox(std::complex<double>{5.267858159063328, 6.283185307179586}));
}

TEST_CASE("cuBLAS_LU::computeLogDet_complex", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 1;
  auto& hstream     = cuda_handles->hstream;

  using StdComp = std::complex<double>;
  // Column-major LU of the complex matrix M factorized in getrf_batched_complex below.
  std::vector<StdComp, CUDAHostAllocator<StdComp>> lu = {{8.0, 0.5},
                                                         {0.8793774319066148, 0.07003891050583658},
                                                         {0.24980544747081712, -0.0031128404669260694},
                                                         {0.6233463035019455, -0.026459143968871595},
                                                         {2.0, 0.1},
                                                         {6.248249027237354, 0.2719844357976654},
                                                         {0.7194170575332381, -0.01831314754114669},
                                                         {0.1212375092639108, 0.02522449751055713},
                                                         {6.0, -0.2},
                                                         {0.7097276264591441, -0.4443579766536965},
                                                         {4.999337315778741, 0.6013141870887196},
                                                         {0.26158183940834034, 0.23245112532996867},
                                                         {4.0, -0.6},
                                                         {4.440466926070039, -1.7525291828793774},
                                                         {0.840192589866152, 1.5044529443071093},
                                                         {1.0698651110730424, -0.10853319738453365}};

  std::vector<StdComp, CUDAAllocator<StdComp>> dev_lu(lu.size());
  std::vector<StdComp*, CUDAHostAllocator<StdComp*>> lus(batch_size);
  std::vector<StdComp*, CUDAAllocator<StdComp*>> dev_lus(batch_size);
  lus[0] = dev_lu.data();

  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 4, 3, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(4);
152 "cudaMemcpyAsync failed copying log_values to device");
154 "cudaMemcpyAsync failed copying lus to device");
156 "cudaMemcpyAsync failed copying log_values to device");
163 "cudaMemcpyAsync failed copying log_values from device");
TEST_CASE("cuBLAS_LU::computeLogDet_float", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 1;
  auto& hstream     = cuda_handles->hstream;

  // Same LU values as the double-precision case; the log determinant is still accumulated in
  // std::complex<double>.
  std::vector<float, CUDAHostAllocator<float>> lu = {7., 0.28571429, 0.71428571, 0.71428571,
                                                     5., 3.57142857, 0.12, -0.44,
                                                     6., 6.28571429, -1.04, -0.46153846,
                                                     6., 5.28571429, 3.08, 7.46153846};
  std::vector<float, CUDAAllocator<float>> dev_lu(lu.size());
  std::vector<float*, CUDAHostAllocator<float*>> lus(batch_size, nullptr);
  std::vector<float*, CUDAAllocator<float*>> dev_lus(batch_size);
  lus[0] = dev_lu.data();

  using StdComp = std::complex<double>;
  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 3, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(4);
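  // A minimal sketch of the single-precision transfer, evaluation, and readback, assuming the
  // templated computeLogDet_batched signature used in the double test above.
  cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(float) * lu.size(), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lu to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_lus.data(), lus.data(), sizeof(float*) * batch_size, cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lus to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * pivots.size(), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying pivots to device");

  cuBLAS_LU::computeLogDet_batched(hstream, n, lda, dev_lus.data(), dev_pivots.data(), dev_log_values.data(), batch_size);

  cudaErrorCheck(cudaMemcpyAsync(log_values.data(), dev_log_values.data(), sizeof(StdComp) * batch_size, cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying log_values from device");
  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed");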
  CHECK(log_values[0] == ComplexApprox(std::complex<double>{5.267858159063328, 6.283185307179586}));
}

TEST_CASE("cuBLAS_LU::computeLogDet(batch=2)", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 2;
  auto& hstream     = cuda_handles->hstream;

  using StdComp = std::complex<double>;
  // Two column-major LU matrices make up the batch; lu is the factorization already checked in
  // computeLogDet_complex above.
  std::vector<StdComp, CUDAHostAllocator<StdComp>> lu = {{8.0, 0.5},
                                                         {0.8793774319066148, 0.07003891050583658},
                                                         {0.24980544747081712, -0.0031128404669260694},
                                                         {0.6233463035019455, -0.026459143968871595},
                                                         {2.0, 0.1},
                                                         {6.248249027237354, 0.2719844357976654},
                                                         {0.7194170575332381, -0.01831314754114669},
                                                         {0.1212375092639108, 0.02522449751055713},
                                                         {6.0, -0.2},
                                                         {0.7097276264591441, -0.4443579766536965},
                                                         {4.999337315778741, 0.6013141870887196},
                                                         {0.26158183940834034, 0.23245112532996867},
                                                         {4.0, -0.6},
                                                         {4.440466926070039, -1.7525291828793774},
                                                         {0.840192589866152, 1.5044529443071093},
                                                         {1.0698651110730424, -0.10853319738453365}};

  std::vector<StdComp, CUDAHostAllocator<StdComp>> lu2 = {{8.0, 0.5},
                                                          {0.8793774319066148, 0.07003891050583658},
                                                          {0.49883268482490273, -0.01867704280155642},
                                                          {0.24980544747081712, -0.0031128404669260694},
                                                          {2.0, 0.1},
                                                          {6.248249027237354, 0.2719844357976654},
                                                          {0.800088933543564, -0.004823898651572499},
                                                          {0.2401906003014191, 0.0025474386841018853},
                                                          {6.0, -0.2},
                                                          {3.3478599221789884, -0.23424124513618677},
                                                          {0.8297816353227319, 1.3593612303468308},
                                                          {0.6377685195602139, -0.6747848919351336},
                                                          {4.0, -0.6},
                                                          {4.440466926070039, -1.7525291828793774},
                                                          {-1.5284389377713894, 1.6976073494521235},
                                                          {2.7608934839023482, -1.542084179899335}};

  std::vector<StdComp, CUDAHostAllocator<StdComp>> dev_lu(lu.size());
  std::vector<StdComp, CUDAHostAllocator<StdComp>> dev_lu2(lu2.size());
  std::vector<StdComp*, CUDAHostAllocator<StdComp*>> lus(batch_size);
  std::vector<StdComp*, CUDAAllocator<StdComp*>> dev_lus(batch_size);
  lus[0] = dev_lu.data();
  lus[1] = dev_lu2.data();

  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 4, 3, 4, 3, 4, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());
275 "cudaMemcpyAsync failed copying log_values to device");
277 "cudaMemcpyAsync failed copying log_values to device");
279 "cudaMemcpyAsync failed copying log_values to device");
282 "cudaMemcpyAsync failed copying log_values to device");
287 "cudaMemcpyAsync failed copying log_values from device");
  CHECK(log_values[0] == ComplexApprox(std::complex<double>{5.603777579195571, -6.1586603331188225}));
  CHECK(log_values[1] == ComplexApprox(std::complex<double>{5.531331998282581, -8.805487075984523}));
}

TEST_CASE("cuBLAS_LU::getrf_batched_complex", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 1;
  auto& hstream     = cuda_handles->hstream;

  using StdComp = std::complex<double>;
  // Input matrix in column-major order; getrf factorizes it in place.
  std::vector<StdComp, CUDAHostAllocator<StdComp>> M = {{2.0, 0.1}, {5.0, 0.1}, {8.0, 0.5}, {7.0, 1.0},
                                                        {5.0, 0.1}, {2.0, 0.2}, {2.0, 0.1}, {8.0, 0.5},
                                                        {7.0, 0.2}, {5.0, 1.0}, {6.0, -0.2}, {6.0, -0.2},
                                                        {5.0, 0.0}, {4.0, -0.1}, {4.0, -0.6}, {8.0, -2.0}};
  std::vector<StdComp, CUDAAllocator<StdComp>> devM(M.size());
  std::vector<StdComp*, CUDAHostAllocator<StdComp*>> Ms(batch_size);
  std::vector<StdComp*, CUDAAllocator<StdComp*>> devMs(batch_size);
  Ms[0] = devM.data();

  // Pivots and infos start from junk values that getrf must overwrite.
  std::vector<int, CUDAHostAllocator<int>> pivots = {1, 1, 1, 1};
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());
  std::vector<int, CUDAHostAllocator<int>> infos = {1, 1, 1, 1};
  std::vector<int, CUDAAllocator<int>> dev_infos(pivots.size());
322 "cudaMemcpyAsync failed copying M to device");
324 "cudaMemcpyAsync failed copying Ms to device");
329 "cudaMemcpyAsync failed copying invM from device");
331 "cudaMemcpyAsync failed copying pivots from device");
  std::vector<int> real_pivot{3, 4, 3, 4};
  for (int i = 0; i < n; ++i)
    CHECK(pivots[i] == real_pivot[i]);
  // Expected in-place LU factorization of M.
  std::vector<StdComp> lu{{8.0, 0.5},
                          {0.8793774319066148, 0.07003891050583658},
                          {0.24980544747081712, -0.0031128404669260694},
                          {0.6233463035019455, -0.026459143968871595},
                          {2.0, 0.1},
                          {6.248249027237354, 0.2719844357976654},
                          {0.7194170575332381, -0.01831314754114669},
                          {0.1212375092639108, 0.02522449751055713},
                          {6.0, -0.2},
                          {0.7097276264591441, -0.4443579766536965},
                          {4.999337315778741, 0.6013141870887196},
                          {0.26158183940834034, 0.23245112532996867},
                          {4.0, -0.6},
                          {4.440466926070039, -1.7525291828793774},
                          {0.840192589866152, 1.5044529443071093},
                          {1.0698651110730424, -0.10853319738453365}};
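  // A minimal sketch of the element-wise comparison, assuming the checkMatrix(a_mat, b_mat)
  // helper and CHECKED_ELSE pattern referenced by this file; result_message is assumed to be
  // the message member of CheckMatrixResult.
  testing::MatrixAccessor<StdComp> lu_mat(lu.data(), n, n);
  testing::MatrixAccessor<StdComp> M_mat(M.data(), n, n);
  auto check_matrix_result = checkMatrix(lu_mat, M_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
}
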
TEST_CASE("cuBLAS_LU::getrf_batched(batch=2)", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 2;
  auto& hstream     = cuda_handles->hstream;

  // Two real matrices in column-major order, factorized in place as one batch.
  std::vector<double, CUDAHostAllocator<double>> M_vec{2, 5, 7, 5, 5, 2, 5, 4, 8, 2, 6, 4, 7, 8, 6, 8};
  std::vector<double, CUDAHostAllocator<double>> M2_vec{6, 5, 7, 5, 2, 2, 5, 4, 8, 2, 6, 4, 3, 8, 6, 8};
  std::vector<double, CUDAAllocator<double>> devM_vec(M_vec.size());
  std::vector<double, CUDAAllocator<double>> devM2_vec(M2_vec.size());
  std::vector<double*, CUDAHostAllocator<double*>> Ms{devM_vec.data(), devM2_vec.data()};
  std::vector<double*, CUDAAllocator<double*>> devMs(Ms.size());

  std::vector<int, CUDAHostAllocator<int>> pivots(8, -1);
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());
  std::vector<int, CUDAHostAllocator<int>> infos(8, 1);
  std::vector<int, CUDAAllocator<int>> dev_infos(pivots.size());
395 "cudaMemcpyAsync failed copying M to device");
398 "cudaMemcpyAsync failed copying M2 to device");
402 "cudaMemcpyAsync failed copying Ms to device");
410 "cudaMemcpyAsync failed copying invM from device");
413 "cudaMemcpyAsync failed copying invM from device");
416 "cudaMemcpyAsync failed copying pivots from device");
  // Expected factorizations; lu is the same LU used in the computeLogDet tests above.
  std::vector<double> lu{7., 0.28571429, 0.71428571, 0.71428571,
                         5., 3.57142857, 0.12, -0.44,
                         6., 6.28571429, -1.04, -0.46153846,
                         6., 5.28571429, 3.08, 7.46153846};
  std::vector<double> lu2{7.0, 0.8571428571428571, 0.7142857142857142, 0.7142857142857142,
                          5.0, -2.2857142857142856, 0.6874999999999998, -0.18750000000000022,
                          6.0, 2.8571428571428577, -4.249999999999999, -0.05882352941176502,
                          6.0, -2.1428571428571423, 5.1875, 3.617647058823531};
  // Pivots for M are {3, 3, 4, 4}; working the elimination of M2 through by hand gives
  // {3, 3, 3, 4} for the second matrix of the batch.
  std::vector<int> real_pivot{3, 3, 4, 4, 3, 3, 3, 4};
  for (int i = 0; i < n * batch_size; ++i)
    CHECK(pivots[i] == real_pivot[i]);
  testing::MatrixAccessor<double> M_mat(M_vec.data(), 4, 4);
  testing::MatrixAccessor<double> lu_mat(lu.data(), 4, 4);
  testing::MatrixAccessor<double> M2_mat(M2_vec.data(), 4, 4);
  testing::MatrixAccessor<double> lu2_mat(lu2.data(), 4, 4);
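  // A minimal sketch of the matrix comparisons, using the same checkMatrix/CHECKED_ELSE pattern
  // assumed in getrf_batched_complex above.
  auto check_matrix_result = checkMatrix(lu_mat, M_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
  check_matrix_result = checkMatrix(lu2_mat, M2_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
}
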
TEST_CASE("cuBLAS_LU::getri_batched", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n             = 4;
  int lda           = 4;
  int batch_size    = 1;
  auto& hstream     = cuda_handles->hstream;

  // Input is the LU-factorized matrix from the computeLogDet test; getri builds the inverse from it.
  std::vector<double, CUDAHostAllocator<double>> M_vec{7., 0.28571429, 0.71428571, 0.71428571,
                                                       5., 3.57142857, 0.12, -0.44,
                                                       6., 6.28571429, -1.04, -0.46153846,
                                                       6., 5.28571429, 3.08, 7.46153846};
  std::vector<double, CUDAAllocator<double>> devM_vec(M_vec.size());
  std::vector<double*, CUDAHostAllocator<double*>> Ms{devM_vec.data()};
  std::vector<double*, CUDAAllocator<double*>> devMs(Ms.size());

  std::vector<double, CUDAHostAllocator<double>> invM_vec{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  std::vector<double, CUDAHostAllocator<double>> dev_invM_vec(invM_vec.size());
  std::vector<double*, CUDAHostAllocator<double*>> invMs{dev_invM_vec.data()};
  std::vector<double*, CUDAAllocator<double*>> dev_invMs(invMs.size());

  std::vector<int, CUDAHostAllocator<int>> pivots{3, 3, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());
  std::vector<int, CUDAHostAllocator<int>> infos(4, 1);
  std::vector<int, CUDAAllocator<int>> dev_infos(pivots.size());
491 "cudaMemcpyAsync failed copying M to device");
493 "cudaMemcpyAsync failed copying Ms to device");
495 "cudaMemcpyAsync failed copying invMs to device");
497 "cudaMemcpyAsync failed copying pivots to device");
501 "cudaMemcpyAsync failed copying invM from device");
503 "cudaMemcpyAsync failed copying infos from device");
  // Reference inverse of the original (unfactorized) matrix.
  std::vector<double> invA{-0.08247423, -0.26804124, 0.26804124, 0.05154639,
                           0.18556701, -0.89690722, 0.39690722, 0.13402062,
                           0.24742268, -0.19587629, 0.19587629, -0.15463918,
                           -0.29896907, 1.27835052, -0.77835052, 0.06185567};
  // Element-wise check of the computed inverse against the reference values.
  for (int i = 0; i < static_cast<int>(invA.size()); ++i)
    CHECK(invM_vec[i] == Approx(invA[i]));
}