QMCPACK
test_syclBLAS.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2021 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 #include "catch.hpp"
13 
14 #include <memory>
15 #include <vector>
16 #include <iostream>
17 #include "DualAllocatorAliases.hpp"
18 #include "SYCL/SYCLruntime.hpp"
19 #include "SYCL/syclBLAS.hpp"
20 #include <OhmmsPETE/OhmmsVector.h>
21 #include <OhmmsPETE/OhmmsMatrix.h>
22 #include "CPU/BLAS.hpp"
23 
24 namespace qmcplusplus
25 {
26 
27 template<typename T>
28 void test_gemv(const int M_b, const int N_b, const char trans)
29 {
30  const int M = trans == 'T' ? M_b : N_b;
31  const int N = trans == 'T' ? N_b : M_b;
32 
33  using vec_t = Vector<T, PinnedDualAllocator<T>>;
34  using mat_t = Matrix<T, PinnedDualAllocator<T>>;
35 
37 
38  vec_t A(N); // Input vector
39  mat_t B(M_b, N_b); // Input matrix
40  vec_t C(M); // Result vector syclBLAS
41  vec_t D(M); // Result vector BLAS
42 
43  // Fill data
44  for (int i = 0; i < N; i++)
45  A[i] = i;
46 
47  for (int j = 0; j < M_b; j++)
48  for (int i = 0; i < N_b; i++)
49  B[j][i] = i + j * 2;
50 
51  // Fill C and D with 0
52  for (int i = 0; i < M; i++)
53  C[i] = D[i] = T(0);
54 
55  A.updateTo();
56  B.updateTo();
57  C.updateTo();
58 
59  T alpha(1);
60  T beta(0);
61 
62  // in Fortran, B[M][N] is viewed as B^T
63  // when trans == 'T', the actual calculation is B * A[N] = C[M]
64  // when trans == 'N', the actual calculation is B^T * A[M] = C[N]
65  syclBLAS::gemv(handle, trans, N_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1)
66  .wait();
67  C.updateFrom();
68 
69  if (trans == 'T')
70  BLAS::gemv_trans(M_b, N_b, B.data(), A.data(), D.data());
71  else
72  BLAS::gemv(M_b, N_b, B.data(), A.data(), D.data());
73 
74  for (int index = 0; index < M; index++)
75  CHECK(C[index] == D[index]);
76 }
77 
78 template<typename T>
79 void test_gemv_batched(const int M_b, const int N_b, const char trans, const int batch_count)
80 {
81  const int M = trans == 'T' ? M_b : N_b;
82  const int N = trans == 'T' ? N_b : M_b;
83 
84  using vec_t = Vector<T, PinnedDualAllocator<T>>;
85  using mat_t = Matrix<T, PinnedDualAllocator<T>>;
86 
88 
89  // Create input vector
90  std::vector<vec_t> As;
91  Vector<const T*, PinnedDualAllocator<const T*>> Aptrs;
92 
93  // Create input matrix
94  std::vector<mat_t> Bs;
95  Vector<const T*, PinnedDualAllocator<const T*>> Bptrs;
96 
97  // Create output vector (syclBLAS)
98  std::vector<vec_t> Cs;
99  Vector<T*, PinnedDualAllocator<T*>> Cptrs;
100 
101  // Create output vector (BLAS)
102  std::vector<vec_t> Ds;
103  Vector<T*, PinnedDualAllocator<T*>> Dptrs;
104 
105  // Resize pointer vectors
106  Aptrs.resize(batch_count);
107  Bptrs.resize(batch_count);
108  Cptrs.resize(batch_count);
109  Dptrs.resize(batch_count);
110 
111  // Resize data vectors
112  As.resize(batch_count);
113  Bs.resize(batch_count);
114  Cs.resize(batch_count);
115  Ds.resize(batch_count);
116 
117  // Fill data
118  for (int batch = 0; batch < batch_count; batch++)
119  {
120  As[batch].resize(N);
121  Aptrs[batch] = As[batch].device_data();
122 
123  Bs[batch].resize(M_b, N_b);
124  Bptrs[batch] = Bs[batch].device_data();
125 
126  Cs[batch].resize(M);
127  Cptrs[batch] = Cs[batch].device_data();
128 
129  Ds[batch].resize(M);
130  Dptrs[batch] = Ds[batch].data();
131 
132  for (int i = 0; i < N; i++)
133  As[batch][i] = i;
134 
135  for (int j = 0; j < M_b; j++)
136  for (int i = 0; i < N_b; i++)
137  Bs[batch][j][i] = i + j * 2;
138 
139  for (int i = 0; i < M; i++)
140  Cs[batch][i] = Ds[batch][i] = T(0);
141 
142  As[batch].updateTo();
143  Bs[batch].updateTo();
144  }
145 
146  Aptrs.updateTo();
147  Bptrs.updateTo();
148  Cptrs.updateTo();
149 
150  // Run tests
151  Vector<T, PinnedDualAllocator<T>> alpha(batch_count);
152  Vector<T, PinnedDualAllocator<T>> beta(batch_count);
153  Vector<T, PinnedDualAllocator<T>> beta1(batch_count);
154 
155  for (int batch = 0; batch < batch_count; batch++)
156  {
157  alpha[batch] = T(0.5);
158  beta[batch] = T(0);
159  beta1[batch] = T(1);
160  }
161 
162  alpha.updateTo();
163  beta.updateTo();
164  beta1.updateTo();
165 
166  // alpha 0.5, beta 0
167  syclBLAS::gemv_batched(handle, trans, N_b, M_b, alpha.device_data(), Bptrs.device_data(), N_b, Aptrs.device_data(), 1,
168  beta.device_data(), Cptrs.device_data(), 1, batch_count)
169  .wait();
170  // alpha 0.5, beta 1
171  syclBLAS::gemv_batched(handle, trans, N_b, M_b, alpha.device_data(), Bptrs.device_data(), N_b, Aptrs.device_data(), 1,
172  beta1.device_data(), Cptrs.device_data(), 1, batch_count)
173  .wait();
174 
175  for (int batch = 0; batch < batch_count; batch++)
176  {
177  Cs[batch].updateFrom();
178  if (trans == 'T')
179  BLAS::gemv_trans(M_b, N_b, Bs[batch].data(), As[batch].data(), Ds[batch].data());
180  else
181  BLAS::gemv(M_b, N_b, Bs[batch].data(), As[batch].data(), Ds[batch].data());
182 
183  // Check results
184  for (int index = 0; index < M; index++)
185  CHECK(Cs[batch][index] == Ds[batch][index]);
186  }
187 }
188 
189 TEST_CASE("OmpBLAS gemv", "[SYCL]")
190 {
191  const int M = 137;
192  const int N = 79;
193  const int batch_count = 23;
194 
195  // Non-batched test
196  std::cout << "Testing TRANS gemv" << std::endl;
197  test_gemv<float>(M, N, 'T');
198  test_gemv<double>(M, N, 'T');
199 #if defined(QMC_COMPLEX)
200  test_gemv<std::complex<float>>(N, M, 'T');
201  test_gemv<std::complex<double>>(N, M, 'T');
202 #endif
203  // Batched Test
204  std::cout << "Testing TRANS gemv_batched" << std::endl;
205  test_gemv_batched<float>(M, N, 'T', batch_count);
206  test_gemv_batched<double>(M, N, 'T', batch_count);
207 #if defined(QMC_COMPLEX)
208  test_gemv_batched<std::complex<float>>(N, M, 'T', batch_count);
209  test_gemv_batched<std::complex<double>>(N, M, 'T', batch_count);
210 #endif
211 }
212 
213 TEST_CASE("OmpBLAS gemv notrans", "[SYCL]")
214 {
215  const int M = 137;
216  const int N = 79;
217  const int batch_count = 23;
218 
219  // Non-batched test
220  std::cout << "Testing NOTRANS gemv" << std::endl;
221  test_gemv<float>(M, N, 'N');
222  test_gemv<double>(M, N, 'N');
223 #if defined(QMC_COMPLEX)
224  test_gemv<std::complex<float>>(N, M, 'N');
225  test_gemv<std::complex<double>>(N, M, 'N');
226 #endif
227  // Batched Test
228  std::cout << "Testing NOTRANS gemv_batched" << std::endl;
229  test_gemv_batched<float>(M, N, 'N', batch_count);
230  test_gemv_batched<double>(M, N, 'N', batch_count);
231 #if defined(QMC_COMPLEX)
232  test_gemv_batched<std::complex<float>>(N, M, 'N', batch_count);
233  test_gemv_batched<std::complex<double>>(N, M, 'N', batch_count);
234 #endif
235 }
236 
237 template<typename T>
238 void test_ger_batched(const int M, const int N, const int batch_count)
239 {
240  using vec_t = Vector<T, PinnedDualAllocator<T>>;
241  using mat_t = Matrix<T, PinnedDualAllocator<T>>;
242 
244 
245  // Create input vector
246  std::vector<vec_t> Xs;
247  Vector<const T*, PinnedDualAllocator<const T*>> Xptrs;
248  std::vector<vec_t> Ys;
249  Vector<const T*, PinnedDualAllocator<const T*>> Yptrs;
250 
251  // Create input matrix
252  std::vector<mat_t> Ahs;
253  Vector<T*, PinnedDualAllocator<T*>> Ahptrs;
254  std::vector<mat_t> Ads;
255  Vector<T*, PinnedDualAllocator<T*>> Adptrs;
256 
257  // Resize pointer vectors
258  Xptrs.resize(batch_count);
259  Yptrs.resize(batch_count);
260  Ahptrs.resize(batch_count);
261  Adptrs.resize(batch_count);
262 
263  // Resize data vectors
264  Xs.resize(batch_count);
265  Ys.resize(batch_count);
266  Ahs.resize(batch_count);
267  Ads.resize(batch_count);
268 
269  // Fill data
270  for (int batch = 0; batch < batch_count; batch++)
271  {
272  Xs[batch].resize(M);
273  Xptrs[batch] = Xs[batch].device_data();
274 
275  Ys[batch].resize(N);
276  Yptrs[batch] = Ys[batch].device_data();
277 
278  Ads[batch].resize(M, N);
279  Adptrs[batch] = Ads[batch].device_data();
280 
281  Ahs[batch].resize(M, N);
282  Ahptrs[batch] = Ahs[batch].data();
283 
284  // Fill data
285  for (int i = 0; i < M; i++)
286  Xs[batch][i] = i;
287  for (int i = 0; i < N; i++)
288  Ys[batch][i] = N - i;
289 
290  for (int j = 0; j < M; j++)
291  for (int i = 0; i < N; i++)
292  {
293  Ads[batch][j][i] = i + j * 2;
294  Ahs[batch][j][i] = i + j * 2;
295  }
296 
297  Xs[batch].updateTo();
298  Ys[batch].updateTo();
299  Ads[batch].updateTo();
300  }
301 
302  Adptrs.updateTo();
303  Xptrs.updateTo();
304  Yptrs.updateTo();
305 
306  // Run tests
307  Vector<T, PinnedDualAllocator<T>> alpha(batch_count);
308 
309  for (int batch = 0; batch < batch_count; batch++)
310  {
311  alpha[batch] = T(1);
312  }
313 
314  alpha.updateTo();
315 
316  syclBLAS::ger_batched(handle, M, N, alpha.device_data(), Xptrs.device_data(), 1, Yptrs.device_data(), 1,
317  Adptrs.device_data(), M, batch_count)
318  .wait();
319 
320  for (int batch = 0; batch < batch_count; batch++)
321  {
322  Ads[batch].updateFrom();
323  BLAS::ger(M, N, alpha[batch], Xs[batch].data(), 1, Ys[batch].data(), 1, Ahs[batch].data(), M);
324 
325  // Check results
326  for (int j = 0; j < M; j++)
327  for (int i = 0; i < N; i++)
328  CHECK(Ads[batch][j][i] == Ahs[batch][j][i]);
329  }
330 }
331 
332 TEST_CASE("OmpBLAS ger", "[SYCL]")
333 {
334  const int M = 137;
335  const int N = 79;
336  const int batch_count = 23;
337 
338  // Batched Test
339  std::cout << "Testing ger_batched" << std::endl;
340  test_ger_batched<float>(M, N, batch_count);
341  test_ger_batched<double>(M, N, batch_count);
342 #if defined(QMC_COMPLEX)
343  test_ger_batched<std::complex<float>>(N, M, batch_count);
344  test_ger_batched<std::complex<double>>(N, M, batch_count);
345 #endif
346 }
347 } // namespace qmcplusplus
void test_ger_batched(const int M, const int N, const int batch_count)
sycl::queue & getSYCLDefaultDeviceDefaultQueue()
return a reference to the per-device default queue
Definition: SYCLruntime.cpp:18
static void gemv_trans(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:147
TinyVector< double, 3 > vec_t
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
sycl::event ger_batched(sycl::queue &handle, const int m, const int n, const T *alpha, const T *const x[], const int incx, const T *const y[], const int incy, T *const A[], const int lda, const size_t batch_count, const std::vector< sycl::event > &events={})
in-house version of ger_batch implemented in SYCL. Can be dropped if we have vendor optimized version...
TEST_CASE("complex_helper", "[type_traits]")
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118
sycl::event gemv_batched(sycl::queue &handle, const char trans, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const size_t batch_count, const std::vector< sycl::event > &events={})
in-house version of gemv_batch implemented in SYCL. Can be dropped if we have vendor optimized versio...
These allocators are to make code that should be generic with the respect to accelerator code flavor ...
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437
sycl::event gemv(sycl::queue &handle, const char trans, const int m, const int n, const T alpha, const T *const A, const int lda, const T *const x, const int incx, const T beta, T *const y, const int incy, const std::vector< sycl::event > &events)
Definition: syclBLAS.cpp:21
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
CHECK(log_values[0]==ComplexApprox(std::complex< double >{ 5.603777579195571, -6.1586603331188225 }))
void test_gemv_batched(const int M_b, const int N_b, const char trans, const int batch_count)
double B(double x, int k, int i, const std::vector< double > &t)
void test_gemv(const int M_b, const int N_b, const char trans)