QMCPACK
test_cuBLAS_LU.cpp
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2021 QMCPACK developers.
//
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
//
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#include "catch.hpp"

#include <memory>
#include <iostream>
#include <vector>
#include "CUDA/CUDAruntime.hpp"
#include "CUDA/cuBLAS.hpp"
#include "CUDA/CUDAfill.hpp"
#include "CUDA/CUDAallocator.hpp"
// Declarations for the cuBLAS_LU kernels and the test helpers (checkMatrix,
// testing::MatrixAccessor) used below; these include paths are assumed from
// the QMCPACK source layout.
#include "detail/CUDA/cuBLAS_LU.hpp"
#include "checkMatrix.hpp"
#include "Utilities/for_testing/MatrixAccessor.hpp"

/** \file
 *
 * These are unit tests for the low-level LU factorization used by the full inversion and
 * calculation of the log determinant for Dirac determinants. Fundamental testing of these kernels
 * requires full knowledge of the memory layout and data movement. As such, OhmmsMatrices and
 * custom allocators are not used; they have their own unit tests (hopefully!). This is also documentation
 * of how these calls expect the memory handed to them to look. Please leave this intact.
 * Someday those container abstractions will change; if inversion breaks and this still works, you
 * will have a fighting chance to know how to change these routines or fix the bug you introduced in the
 * higher-level abstractions.
 *
 * Reference data generated by qmcpack/tests/scripts/inversion_ref.py
 */
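// All matrix buffers below are flat, column-major arrays, as cuBLAS expects:
// element (i, j) of an n x n matrix with leading dimension lda lives at
// data[i + j * lda]. A minimal sketch of that index math (an illustration,
// not part of the tests; col_major_index is a hypothetical helper):
//
//   inline int col_major_index(int i, int j, int lda) { return i + j * lda; }
//
// So for the 4x4 cases below (lda == 4), each group of four consecutive values
// in an initializer list is one column of the matrix.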
namespace qmcplusplus
{
namespace testing
{
/** Doesn't depend on the resource management scheme; that's out of scope for unit tests */
struct CUDAHandles
{
  // CUDA specific variables
  cudaStream_t hstream;
  cublasHandle_t h_cublas;

  CUDAHandles()
  {
    cudaErrorCheck(cudaStreamCreate(&hstream), "cudaStreamCreate failed!");
    cublasErrorCheck(cublasCreate(&h_cublas), "cublasCreate failed!");
    cublasErrorCheck(cublasSetStream(h_cublas, hstream), "cublasSetStream failed!");
  }

  ~CUDAHandles()
  {
    cublasErrorCheck(cublasDestroy(h_cublas), "cublasDestroy failed!");
    cudaErrorCheck(cudaStreamDestroy(hstream), "cudaStreamDestroy failed!");
  }
};
} // namespace testing
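// Note: each TEST_CASE below constructs its own CUDAHandles, so the stream and
// cuBLAS handle live exactly as long as the test. Every transfer and kernel
// launch is issued on that one stream, which is why a single
// cudaStreamSynchronize before the host-side CHECKs is sufficient.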

/** Single double computeLogDet */
TEST_CASE("cuBLAS_LU::computeLogDet", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  int batch_size = 1;
  auto& hstream = cuda_handles->hstream;

  // clang-format off
  std::vector<double, CUDAHostAllocator<double>> lu = {7., 0.28571429, 0.71428571, 0.71428571,
                                                       5., 3.57142857, 0.12, -0.44,
                                                       6., 6.28571429, -1.04, -0.46153846,
                                                       6., 5.28571429, 3.08, 7.46153846};
  // clang-format on
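  // The data above is the combined LU factorization of the real 4x4 test
  // matrix (unit lower triangle stored below the upper triangle), one column
  // per source line, from the reference script named in the file comment.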
  std::vector<double, CUDAAllocator<double>> dev_lu(16);

  std::vector<double*, CUDAHostAllocator<double*>> lus(1, nullptr);
  lus[0] = dev_lu.data();
  std::vector<double*, CUDAAllocator<double*>> dev_lus(1);
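  // Batched cuBLAS calls take a device array of device pointers: lus is a
  // pinned host staging array holding each matrix's device address, and
  // dev_lus is its device-resident copy that the kernel actually reads.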

  using StdComp = std::complex<double>;
  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size, 0.0);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size, 0.0);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 3, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(4);
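  // Pivots use the LAPACK/cuBLAS getrf convention: 1-based indices, entry i
  // recording the row that was swapped with row i during factorization.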

  // Transfers and launch kernel.
  cudaCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) * 16, cudaMemcpyHostToDevice,
                            hstream));
  cudaCheck(cudaMemcpyAsync(dev_lus.data(), lus.data(), sizeof(double*), cudaMemcpyHostToDevice, hstream));
  cudaCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * 4, cudaMemcpyHostToDevice, hstream));

  // The types of the pointers passed here matter.
  // Pass the C++ types.
  cuBLAS_LU::computeLogDet_batched(cuda_handles->hstream, n, lda, dev_lus.data(), dev_pivots.data(),
                                   dev_log_values.data(), batch_size);
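  // computeLogDet_batched effectively sums log(U_ii) over the diagonal of each
  // LU matrix, accounting for the row interchanges recorded in pivots, and
  // returns a complex logarithm so the determinant's sign/phase is preserved.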

  // Copy back the log_values and synchronize before checking on the host.
  cudaCheck(cudaMemcpyAsync(log_values.data(), dev_log_values.data(), sizeof(std::complex<double>) * batch_size,
                            cudaMemcpyDeviceToHost, hstream));
  cudaCheck(cudaStreamSynchronize(hstream));
  CHECK(log_values[0] == ComplexApprox(std::complex<double>{5.267858159063328, 6.283185307179586}));
}

TEST_CASE("cuBLAS_LU::computeLogDet_complex", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  int batch_size = 1;
  auto& hstream = cuda_handles->hstream;

  using StdComp = std::complex<double>;
  // clang-format off
  std::vector<StdComp,
              CUDAHostAllocator<StdComp>> lu = {{8.0, 0.5},
                                                {0.8793774319066148, 0.07003891050583658},
                                                {0.24980544747081712, -0.0031128404669260694},
                                                {0.6233463035019455, -0.026459143968871595},
                                                {2.0, 0.1},
                                                {6.248249027237354, 0.2719844357976654},
                                                {0.7194170575332381, -0.01831314754114669},
                                                {0.1212375092639108, 0.02522449751055713},
                                                {6.0, -0.2},
                                                {0.7097276264591441, -0.4443579766536965},
                                                {4.999337315778741, 0.6013141870887196},
                                                {0.26158183940834034, 0.23245112532996867},
                                                {4.0, -0.6},
                                                {4.440466926070039, -1.7525291828793774},
                                                {0.840192589866152, 1.5044529443071093},
                                                {1.0698651110730424, -0.10853319738453365}};
  // clang-format on
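  // This input is itself reference output: it matches the expected LU checked
  // in getrf_batched_complex below, so the two tests cross-validate.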
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_lu(lu.size());
  std::vector<StdComp*, CUDAHostAllocator<StdComp*>> lus(batch_size);
  lus[0] = dev_lu.data();
  std::vector<StdComp*, CUDAAllocator<StdComp*>> dev_lus(batch_size);

  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 4, 3, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(4);

  cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(StdComp) * lu.size(), cudaMemcpyHostToDevice,
                                 hstream),
                 "cudaMemcpyAsync failed copying lu to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_lus.data(), lus.data(), sizeof(StdComp*), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lus to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * pivots.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying pivots to device");

  cuBLAS_LU::computeLogDet_batched(cuda_handles->hstream, n, lda, dev_lus.data(), dev_pivots.data(),
                                   dev_log_values.data(), batch_size);

  cudaErrorCheck(cudaMemcpyAsync(log_values.data(), dev_log_values.data(), sizeof(StdComp) * batch_size,
                                 cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying log_values from device");
  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");
  CHECK(log_values[0] == ComplexApprox(StdComp{5.603777579195571, -6.1586603331188225}));
}

/** While this working is a good test, in production code it's likely we want to
 * widen the matrix M to double, and thereby the LU matrix as well.
 */
TEST_CASE("cuBLAS_LU::computeLogDet_float", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  int batch_size = 1;
  auto& hstream = cuda_handles->hstream;

  // clang-format off
  std::vector<float, CUDAHostAllocator<float>> lu = {7., 0.28571429, 0.71428571, 0.71428571,
                                                     5., 3.57142857, 0.12, -0.44,
                                                     6., 6.28571429, -1.04, -0.46153846,
                                                     6., 5.28571429, 3.08, 7.46153846};
  // clang-format on
  std::vector<float, CUDAAllocator<float>> dev_lu(lu.size());

  std::vector<float*, CUDAHostAllocator<float*>> lus(batch_size, nullptr);
  lus[0] = dev_lu.data();
  std::vector<float*, CUDAAllocator<float*>> dev_lus(batch_size);

  using StdComp = std::complex<double>;
  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size, 0.0);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size, 0.0);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 3, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(4);

  // Transfer and run kernel.
  cudaCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(float) * 16, cudaMemcpyHostToDevice, hstream));
  cudaCheck(cudaMemcpyAsync(dev_lus.data(), lus.data(), sizeof(float*), cudaMemcpyHostToDevice, hstream));
  cudaCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * 4, cudaMemcpyHostToDevice, hstream));

  // The types of the pointers passed here matter.
  cuBLAS_LU::computeLogDet_batched(cuda_handles->hstream, n, lda, dev_lus.data(), dev_pivots.data(),
                                   dev_log_values.data(), batch_size);

  cudaCheck(cudaMemcpyAsync(log_values.data(), dev_log_values.data(), sizeof(std::complex<double>) * batch_size,
                            cudaMemcpyDeviceToHost, hstream));
  cudaCheck(cudaStreamSynchronize(hstream));
  CHECK(log_values[0] == ComplexApprox(std::complex<double>{5.267858159063328, 6.283185307179586}));
}

TEST_CASE("cuBLAS_LU::computeLogDet(batch=2)", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  auto& hstream = cuda_handles->hstream;
  int batch_size = 2;

  using StdComp = std::complex<double>;
  // clang-format off
  std::vector<StdComp,
              CUDAHostAllocator<StdComp>> lu = {{8.0, 0.5},
                                                {0.8793774319066148, 0.07003891050583658},
                                                {0.24980544747081712, -0.0031128404669260694},
                                                {0.6233463035019455, -0.026459143968871595},
                                                {2.0, 0.1},
                                                {6.248249027237354, 0.2719844357976654},
                                                {0.7194170575332381, -0.01831314754114669},
                                                {0.1212375092639108, 0.02522449751055713},
                                                {6.0, -0.2},
                                                {0.7097276264591441, -0.4443579766536965},
                                                {4.999337315778741, 0.6013141870887196},
                                                {0.26158183940834034, 0.23245112532996867},
                                                {4.0, -0.6},
                                                {4.440466926070039, -1.7525291828793774},
                                                {0.840192589866152, 1.5044529443071093},
                                                {1.0698651110730424, -0.10853319738453365}};
  std::vector<StdComp,
              CUDAHostAllocator<StdComp>> lu2 = {{8.0, 0.5},
                                                 {0.8793774319066148, 0.07003891050583658},
                                                 {0.49883268482490273, -0.01867704280155642},
                                                 {0.24980544747081712, -0.0031128404669260694},
                                                 {2.0, 0.1},
                                                 {6.248249027237354, 0.2719844357976654},
                                                 {0.800088933543564, -0.004823898651572499},
                                                 {0.2401906003014191, 0.0025474386841018853},
                                                 {3.0, -0.2},
                                                 {3.3478599221789884, -0.23424124513618677},
                                                 {0.8297816353227319, 1.3593612303468308},
                                                 {0.6377685195602139, -0.6747848919351336},
                                                 {4.0, -0.6},
                                                 {4.440466926070039, -1.7525291828793774},
                                                 {-1.5284389377713894, 1.6976073494521235},
                                                 {2.7608934839023482, -1.542084179899335}};
  // clang-format on
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_lu(lu.size());
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_lu2(lu2.size());

  std::vector<StdComp*, CUDAHostAllocator<StdComp*>> lus(batch_size);
  lus[0] = dev_lu.data();
  lus[1] = dev_lu2.data();
  std::vector<StdComp*, CUDAAllocator<StdComp*>> dev_lus(batch_size);

  std::vector<StdComp, CUDAHostAllocator<StdComp>> log_values(batch_size);
  std::vector<StdComp, CUDAAllocator<StdComp>> dev_log_values(batch_size);

  std::vector<int, CUDAHostAllocator<int>> pivots = {3, 4, 3, 4, 3, 4, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());
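  // One getrf pivot array per matrix, stored back to back: entries 0-3 belong
  // to lu, entries 4-7 to lu2.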

  cudaErrorCheck(cudaMemcpyAsync(dev_lu.data(), lu.data(), sizeof(decltype(lu)::value_type) * lu.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lu to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_lu2.data(), lu2.data(), sizeof(decltype(lu2)::value_type) * lu2.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lu2 to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_lus.data(), lus.data(), sizeof(decltype(lus)::value_type) * lus.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying lus to device");

  cudaErrorCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * pivots.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying pivots to device");

  cuBLAS_LU::computeLogDet_batched(cuda_handles->hstream, n, lda, dev_lus.data(), dev_pivots.data(),
                                   dev_log_values.data(), batch_size);
  cudaErrorCheck(cudaMemcpyAsync(log_values.data(), dev_log_values.data(), sizeof(std::complex<double>) * batch_size,
                                 cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying log_values from device");
  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");

  CHECK(log_values[0] == ComplexApprox(std::complex<double>{5.603777579195571, -6.1586603331188225}));
  CHECK(log_values[1] == ComplexApprox(std::complex<double>{5.531331998282581, -8.805487075984523}));
}


TEST_CASE("cuBLAS_LU::getrf_batched_complex", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  int batch_size = 1;
  auto& hstream = cuda_handles->hstream;

  using StdComp = std::complex<double>;
  // clang-format off
  std::vector<StdComp, CUDAHostAllocator<StdComp>> M = {{2.0, 0.1}, {5.0, 0.1}, {8.0, 0.5}, {7.0, 1.0},
                                                        {5.0, 0.1}, {2.0, 0.2}, {2.0, 0.1}, {8.0, 0.5},
                                                        {7.0, 0.2}, {5.0, 1.0}, {6.0, -0.2}, {6.0, -0.2},
                                                        {5.0, 0.0}, {4.0, -0.1}, {4.0, -0.6}, {8.0, -2.0}};
  // clang-format on
  std::vector<StdComp, CUDAAllocator<StdComp>> devM(M.size());
  std::vector<StdComp*, CUDAHostAllocator<StdComp*>> Ms(batch_size);
  std::vector<StdComp*, CUDAAllocator<StdComp*>> devMs(batch_size);
  Ms[0] = devM.data();
  std::vector<int, CUDAHostAllocator<int>> pivots = {1, 1, 1, 1};
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());

  std::vector<int, CUDAHostAllocator<int>> infos = {1, 1, 1, 1};
  std::vector<int, CUDAAllocator<int>> dev_infos(infos.size());
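  // computeGetrf_batched takes both a host and a device info array; cuBLAS
  // writes one status per matrix, 0 meaning success. pivots and infos start at
  // values the factorization is expected to overwrite.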

  cudaErrorCheck(cudaMemcpyAsync(devM.data(), M.data(), sizeof(decltype(devM)::value_type) * devM.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying M to device");
  cudaErrorCheck(cudaMemcpyAsync(devMs.data(), Ms.data(), sizeof(decltype(devMs)::value_type) * devMs.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying Ms to device");

  cuBLAS_LU::computeGetrf_batched(cuda_handles->h_cublas, cuda_handles->hstream, n, lda, devMs.data(),
                                  dev_pivots.data(), infos.data(), dev_infos.data(), batch_size);

  cudaErrorCheck(cudaMemcpyAsync(M.data(), devM.data(), sizeof(decltype(devM)::value_type) * devM.size(),
                                 cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying the factored M from device");
  cudaErrorCheck(cudaMemcpyAsync(pivots.data(), dev_pivots.data(), sizeof(int) * pivots.size(),
                                 cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying pivots from device");

  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");

  std::vector<int> real_pivot = {3, 4, 3, 4};

  auto checkArray = [](auto A, auto B, int n) {
    for (int i = 0; i < n; ++i)
    {
      CHECK(A[i] == B[i]);
    }
  };
  checkArray(real_pivot.begin(), pivots.begin(), 4);
  // clang-format off
  std::vector<StdComp> lu{{8.0, 0.5},
                          {0.8793774319066148, 0.07003891050583658},
                          {0.24980544747081712, -0.0031128404669260694},
                          {0.6233463035019455, -0.026459143968871595},
                          {2.0, 0.1},
                          {6.248249027237354, 0.2719844357976654},
                          {0.7194170575332381, -0.01831314754114669},
                          {0.1212375092639108, 0.02522449751055713},
                          {6.0, -0.2},
                          {0.7097276264591441, -0.4443579766536965},
                          {4.999337315778741, 0.6013141870887196},
                          {0.26158183940834034, 0.23245112532996867},
                          {4.0, -0.6},
                          {4.440466926070039, -1.7525291828793774},
                          {0.840192589866152, 1.5044529443071093},
                          {1.0698651110730424, -0.10853319738453365}};
  // clang-format on
  // This could actually be any container that supported the concept of
  // access via operator()(i, j) and had a <T, ALLOCT> template signature.
  testing::MatrixAccessor<StdComp> M_mat(M.data(), 4, 4);
  testing::MatrixAccessor<StdComp> lu_mat(lu.data(), 4, 4);
  auto check_matrix_result = checkMatrix(lu_mat, M_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
}

TEST_CASE("cuBLAS_LU::getrf_batched(batch=2)", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  auto& hstream = cuda_handles->hstream;

  int batch_size = 2;

  std::vector<double, CUDAHostAllocator<double>> M_vec{2, 5, 7, 5, 5, 2, 5, 4, 8, 2, 6, 4, 7, 8, 6, 8};
  std::vector<double, CUDAHostAllocator<double>> M2_vec{6, 5, 7, 5, 2, 2, 5, 4, 8, 2, 6, 4, 3, 8, 6, 8};
  std::vector<double, CUDAAllocator<double>> devM_vec(M_vec.size());
  std::vector<double, CUDAAllocator<double>> devM2_vec(M2_vec.size());
  std::vector<double*, CUDAHostAllocator<double*>> Ms{devM_vec.data(), devM2_vec.data()};
  std::vector<double*, CUDAAllocator<double*>> devMs(Ms.size());

  std::vector<int, CUDAHostAllocator<int>> pivots(8, -1);
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());

  std::vector<int, CUDAHostAllocator<int>> infos(8, 1);
  std::vector<int, CUDAAllocator<int>> dev_infos(infos.size());

  // Now copy the Ms.
  cudaErrorCheck(cudaMemcpyAsync(devM_vec.data(), M_vec.data(), sizeof(decltype(M_vec)::value_type) * M_vec.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying M to device");
  cudaErrorCheck(cudaMemcpyAsync(devM2_vec.data(), M2_vec.data(), sizeof(decltype(M2_vec)::value_type) * M2_vec.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying M2 to device");
  // Copy the pointer array.
  cudaErrorCheck(cudaMemcpyAsync(devMs.data(), Ms.data(), sizeof(decltype(Ms)::value_type) * Ms.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying Ms to device");

  cuBLAS_LU::computeGetrf_batched(cuda_handles->h_cublas, cuda_handles->hstream, n, lda, devMs.data(),
                                  dev_pivots.data(), infos.data(), dev_infos.data(), batch_size);

  // Copy back the Ms and pivots.
  cudaErrorCheck(cudaMemcpyAsync(M_vec.data(), devM_vec.data(), sizeof(decltype(M_vec)::value_type) * M_vec.size(),
                                 cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying the factored M from device");
  cudaErrorCheck(cudaMemcpyAsync(M2_vec.data(), devM2_vec.data(), sizeof(decltype(M2_vec)::value_type) * M2_vec.size(),
                                 cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying the factored M2 from device");
  cudaErrorCheck(cudaMemcpyAsync(pivots.data(), dev_pivots.data(), sizeof(int) * pivots.size(), cudaMemcpyDeviceToHost,
                                 hstream),
                 "cudaMemcpyAsync failed copying pivots from device");

  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");

  // clang-format off
  std::vector<double> lu{7., 0.28571429, 0.71428571, 0.71428571,
                         5., 3.57142857, 0.12, -0.44,
                         6., 6.28571429, -1.04, -0.46153846,
                         6., 5.28571429, 3.08, 7.46153846};

  std::vector<double> lu2{7.0, 0.8571428571428571, 0.7142857142857142, 0.7142857142857142,
                          5.0, -2.2857142857142856, 0.6874999999999998, -0.18750000000000022,
                          6.0, 2.8571428571428577, -4.249999999999999, -0.05882352941176502,
                          6.0, -2.1428571428571423, 5.1875, 3.617647058823531};
  // clang-format on
  std::vector<int> real_pivot{3, 3, 4, 4, 3, 3, 3, 4};

  auto checkArray = [](auto A, auto B, int n) {
    for (int i = 0; i < n; ++i)
    {
      CHECK(A[i] == Approx(B[i]));
    }
  };
  checkArray(real_pivot.begin(), pivots.begin(), 8);

  testing::MatrixAccessor<double> M_mat(M_vec.data(), 4, 4);
  testing::MatrixAccessor<double> lu_mat(lu.data(), 4, 4);
  testing::MatrixAccessor<double> M2_mat(M2_vec.data(), 4, 4);
  testing::MatrixAccessor<double> lu2_mat(lu2.data(), 4, 4);

  auto check_matrix_result = checkMatrix(lu_mat, M_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
  check_matrix_result = checkMatrix(lu2_mat, M2_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
}

TEST_CASE("cuBLAS_LU::getri_batched", "[wavefunction][CUDA]")
{
  auto cuda_handles = std::make_unique<testing::CUDAHandles>();
  int n = 4;
  int lda = 4;
  auto& hstream = cuda_handles->hstream;
  int batch_size = 1;

  // clang-format off
  std::vector<double, CUDAHostAllocator<double>> M_vec{7., 0.28571429, 0.71428571, 0.71428571,
                                                       5., 3.57142857, 0.12, -0.44,
                                                       6., 6.28571429, -1.04, -0.46153846,
                                                       6., 5.28571429, 3.08, 7.46153846};
  // clang-format on
  std::vector<double, CUDAAllocator<double>> devM_vec(M_vec.size());

  std::vector<double*, CUDAHostAllocator<double*>> Ms{devM_vec.data()};
  std::vector<double*, CUDAAllocator<double*>> devMs(Ms.size());

  std::vector<double, CUDAHostAllocator<double>> invM_vec{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  std::vector<double, CUDAAllocator<double>> dev_invM_vec(invM_vec.size());

  std::vector<double*, CUDAHostAllocator<double*>> invMs{dev_invM_vec.data()};
  std::vector<double*, CUDAAllocator<double*>> dev_invMs(invMs.size());

  std::vector<int, CUDAHostAllocator<int>> pivots{3, 3, 4, 4};
  std::vector<int, CUDAAllocator<int>> dev_pivots(pivots.size());

  std::vector<int, CUDAHostAllocator<int>> infos(4, 1);
  std::vector<int, CUDAAllocator<int>> dev_infos(infos.size());

  cudaErrorCheck(cudaMemcpyAsync(devM_vec.data(), M_vec.data(), sizeof(decltype(M_vec)::value_type) * M_vec.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying M to device");
  cudaErrorCheck(cudaMemcpyAsync(devMs.data(), Ms.data(), sizeof(double*), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying Ms to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_invMs.data(), invMs.data(), sizeof(double*), cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying invMs to device");
  cudaErrorCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * pivots.size(),
                                 cudaMemcpyHostToDevice, hstream),
                 "cudaMemcpyAsync failed copying pivots to device");
  cuBLAS_LU::computeGetri_batched(cuda_handles->h_cublas, cuda_handles->hstream, n, lda, devMs.data(),
                                  dev_invMs.data(), dev_pivots.data(), infos.data(), dev_infos.data(), batch_size);

  cudaErrorCheck(cudaMemcpyAsync(invM_vec.data(), dev_invM_vec.data(), sizeof(double) * 16, cudaMemcpyDeviceToHost,
                                 hstream),
                 "cudaMemcpyAsync failed copying invM from device");
  cudaErrorCheck(cudaMemcpyAsync(infos.data(), dev_infos.data(), sizeof(int) * 4, cudaMemcpyDeviceToHost, hstream),
                 "cudaMemcpyAsync failed copying infos from device");
  cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");

  // clang-format off
  std::vector<double> invA{-0.08247423, -0.26804124, 0.26804124, 0.05154639,
                           0.18556701, -0.89690722, 0.39690722, 0.13402062,
                           0.24742268, -0.19587629, 0.19587629, -0.15463918,
                           -0.29896907, 1.27835052, -0.77835052, 0.06185567};
  // clang-format on
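  // invA is the reference inverse of the original real test matrix, generated
  // by the inversion_ref.py script named in the file comment.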

  auto checkArray = [](auto A, auto B, int n) {
    for (int i = 0; i < n; ++i)
    {
      CHECK(A[i] == Approx(B[i]));
    }
  };

  testing::MatrixAccessor<double> invA_mat(invA.data(), 4, 4);
  testing::MatrixAccessor<double> invM_mat(invM_vec.data(), 4, 4);

  auto check_matrix_result = checkMatrix(invA_mat, invM_mat);
  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
}

} // namespace qmcplusplus