QMCPACK
test_AccelBLAS.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2024 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 #include "catch.hpp"
13 
14 #include <DualAllocatorAliases.hpp>
15 #include <AccelBLAS.hpp>
16 #include <OhmmsPETE/OhmmsVector.h>
17 #include <OhmmsPETE/OhmmsMatrix.h>
18 #include <CPU/BLAS.hpp>
19 
20 namespace qmcplusplus
21 {
22 template<PlatformKind PL, typename T>
23 void test_one_gemm(const int M, const int N, const int K, const char transa, const char transb)
24 {
25  const int a0 = transa == 'T' ? M : K;
26  const int a1 = transa == 'T' ? K : M;
27 
28  const int b0 = transb == 'T' ? K : N;
29  const int b1 = transb == 'T' ? N : K;
30 
32  using mat_t = Matrix<T, PinnedDualAllocator<T>>;
33 
34  mat_t A(a0, a1); // Input matrix
35  mat_t B(b0, b1); // Input matrix
36  mat_t C(N, M); // Result matrix AccelBLAS
37  mat_t D(N, M); // Result matrix BLAS
38 
39  // Fill data
40  for (int j = 0; j < a0; j++)
41  for (int i = 0; i < a1; i++)
42  A[j][i] = i * 3 + j * 4;
43 
44  for (int j = 0; j < b0; j++)
45  for (int i = 0; i < b1; i++)
46  B[j][i] = i * 4 + j * 5;
47 
48  A.updateTo();
49  B.updateTo();
50  C.updateTo();
51 
52  T alpha(1);
53  T beta(0);
54 
55  // U[X,Y] denotes a row-major matrix U with X rows and Y cols
56  // element U(i,j) is located at: U.data() + sizeof(U_type) * (i*ldU + j)
57  //
58  // A,B,C,D are treated as row-major matrices, but the arguments to gemm are treated as col-major
59  // so the call below to AccelBLAS::gemm is equivalent to one of the following (with row-major matrices)
60  // transa/transb == 'N'/'N': C[N,M] = A[K,M] * B[N,K]; C = B * A
61  // transa/transb == 'N'/'T': C[N,M] = A[K,M] * B[K,N]; C = B^t * A
62  // transa/transb == 'T'/'N': C[N,M] = A[M,K] * B[N,K]; C = B * A^t
63  // transa/transb == 'T'/'T': C[N,M] = A[M,K] * B[K,N]; C = B^t * A^t
64 
67  compute::BLAS::gemm(h_blas, transa, transb, M, N, K, alpha, A.device_data(), a1, B.device_data(), b1, beta,
68  C.device_data(), M);
69  queue.sync();
70 
71  C.updateFrom();
72 
73  BLAS::gemm(transa, transb, M, N, K, alpha, A.data(), a1, B.data(), b1, beta, D.data(), M);
74 
75  for (int j = 0; j < N; j++)
76  for (int i = 0; i < M; i++)
77  {
78  CHECK(std::real(C[j][i]) == Approx(std::real(D[j][i])));
79  CHECK(std::imag(C[j][i]) == Approx(std::imag(D[j][i])));
80  }
81 
82  mat_t A2(a0, a1); // Input matrix
83  mat_t B2(b0, b1); // Input matrix
84  mat_t C2(N, M); // Result matrix AccelBLAS
85  mat_t D2(N, M); // Result matrix BLAS
86 
87  // Fill data
88  for (int j = 0; j < a0; j++)
89  for (int i = 0; i < a1; i++)
90  A2[j][i] = j * 3 + i * 4;
91 
92  for (int j = 0; j < b0; j++)
93  for (int i = 0; i < b1; i++)
94  B2[j][i] = j * 4 + i * 5;
95 
96  A2.updateTo();
97  B2.updateTo();
98 
101 
102  Aarr[0] = A2.device_data();
103  Aarr[1] = A.device_data();
104  Barr[0] = B2.device_data();
105  Barr[1] = B.device_data();
106 
107  Carr[0] = C.device_data();
108  Carr[1] = C2.device_data();
109 
110  Aarr.updateTo();
111  Barr.updateTo();
112  Carr.updateTo();
113 
114  compute::BLAS::gemm_batched(h_blas, transa, transb, M, N, K, alpha, Aarr.device_data(), a1, Barr.device_data(), b1,
115  beta, Carr.device_data(), M, 2);
116  queue.sync();
117 
118  C.updateFrom();
119  C2.updateFrom();
120 
121  BLAS::gemm(transa, transb, M, N, K, alpha, A2.data(), a1, B2.data(), b1, beta, D2.data(), M);
122 
123  for (int j = 0; j < N; j++)
124  for (int i = 0; i < M; i++)
125  {
126  CHECK(std::real(C2[j][i]) == Approx(std::real(D[j][i])));
127  CHECK(std::imag(C2[j][i]) == Approx(std::imag(D[j][i])));
128  }
129 
130  for (int j = 0; j < N; j++)
131  for (int i = 0; i < M; i++)
132  {
133  CHECK(std::real(C[j][i]) == Approx(std::real(D2[j][i])));
134  CHECK(std::imag(C[j][i]) == Approx(std::imag(D2[j][i])));
135  }
136 }
137 
138 template<PlatformKind PL>
140 {
141  const int M = 29;
142  const int N = 31;
143  const int K = 23;
144 
145  // Non-batched test
146  std::cout << "Testing NN gemm" << std::endl;
147  test_one_gemm<PL, float>(M, N, K, 'N', 'N');
148  test_one_gemm<PL, double>(M, N, K, 'N', 'N');
149 #if defined(QMC_COMPLEX)
150  test_one_gemm<PL, std::complex<float>>(N, M, K, 'N', 'N');
151  test_one_gemm<PL, std::complex<double>>(N, M, K, 'N', 'N');
152 #endif
153  std::cout << "Testing NT gemm" << std::endl;
154  test_one_gemm<PL, float>(M, N, K, 'N', 'T');
155  test_one_gemm<PL, double>(M, N, K, 'N', 'T');
156 #if defined(QMC_COMPLEX)
157  test_one_gemm<PL, std::complex<float>>(N, M, K, 'N', 'T');
158  test_one_gemm<PL, std::complex<double>>(N, M, K, 'N', 'T');
159 #endif
160  std::cout << "Testing TN gemm" << std::endl;
161  test_one_gemm<PL, float>(M, N, K, 'T', 'N');
162  test_one_gemm<PL, double>(M, N, K, 'T', 'N');
163 #if defined(QMC_COMPLEX)
164  test_one_gemm<PL, std::complex<float>>(N, M, K, 'T', 'N');
165  test_one_gemm<PL, std::complex<double>>(N, M, K, 'T', 'N');
166 #endif
167  std::cout << "Testing TT gemm" << std::endl;
168  test_one_gemm<PL, float>(M, N, K, 'T', 'T');
169  test_one_gemm<PL, double>(M, N, K, 'T', 'T');
170 #if defined(QMC_COMPLEX)
171  test_one_gemm<PL, std::complex<float>>(N, M, K, 'T', 'T');
172  test_one_gemm<PL, std::complex<double>>(N, M, K, 'T', 'T');
173 #endif
174 }
175 
176 template<PlatformKind PL, typename T>
177 void test_one_gemv(const int M_b, const int N_b, const char trans)
178 {
179  const int M = trans == 'T' ? M_b : N_b;
180  const int N = trans == 'T' ? N_b : M_b;
181 
183  using mat_t = Matrix<T, PinnedDualAllocator<T>>;
184 
185  vec_t A(N); // Input vector
186  mat_t B(M_b, N_b); // Input matrix
187  vec_t C(M); // Result vector AccelBLAS
188  vec_t D(M); // Result vector BLAS
189 
190  // Fill data
191  for (int i = 0; i < N; i++)
192  A[i] = i;
193 
194  for (int j = 0; j < M_b; j++)
195  for (int i = 0; i < N_b; i++)
196  B[j][i] = i + j * 2;
197 
198  // Fill C and D with 0
199  for (int i = 0; i < M; i++)
200  C[i] = D[i] = T(0);
201 
202  A.updateTo();
203  B.updateTo();
204  C.updateTo();
205 
206  T alpha(0.5);
207  T beta(0);
208 
211  // in Fortran, B[M][N] is viewed as B^T
212  // when trans == 'T', the actual calculation is B * A[N] = C[M]
213  // when trans == 'N', the actual calculation is B^T * A[M] = C[N]
214  compute::BLAS::gemv(h_blas, trans, N_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(),
215  1);
216  queue.sync();
217 
218  C.updateFrom();
219 
220  if (trans == 'T')
221  BLAS::gemv_trans(M_b, N_b, B.data(), A.data(), D.data());
222  else
223  BLAS::gemv(M_b, N_b, B.data(), A.data(), D.data());
224 
225  for (int index = 0; index < M; index++)
226  CHECK(C[index] == D[index] * alpha);
227 
228  vec_t A2(N); // Input vector
229  mat_t B2(M_b, N_b); // Input matrix
230  vec_t C2(M); // Result vector AccelBLAS
231  vec_t D2(M); // Result vector BLAS
232 
233  // Fill data
234  for (int i = 0; i < N; i++)
235  A2[i] = i + 1;
236 
237  for (int j = 0; j < M_b; j++)
238  for (int i = 0; i < N_b; i++)
239  B2[j][i] = i * 2 + j;
240 
241  // Fill C and D with 0
242  for (int i = 0; i < M; i++)
243  C2[i] = D2[i] = T(0);
244 
245  A2.updateTo();
246  B2.updateTo();
247  C2.updateTo();
248 
251  Vector<T, PinnedDualAllocator<T>> alpha_arr(2), beta_arr(2);
252 
253  Aarr[0] = A2.device_data();
254  Aarr[1] = A.device_data();
255  Barr[0] = B2.device_data();
256  Barr[1] = B.device_data();
257 
258  Carr[0] = C2.device_data();
259  Carr[1] = C.device_data();
260 
261  alpha_arr[0] = 2;
262  alpha_arr[1] = 0.5;
263  beta_arr[0] = 0;
264  beta_arr[1] = 1;
265 
266  Aarr.updateTo();
267  Barr.updateTo();
268  Carr.updateTo();
269  alpha_arr.updateTo();
270  beta_arr.updateTo();
271 
272  // in Fortran, B[M][N] is viewed as B^T
273  // when trans == 'T', the actual calculation is B * A[N] = C[M]
274  // when trans == 'N', the actual calculation is B^T * A[M] = C[N]
275  compute::BLAS::gemv_batched(h_blas, trans, N_b, M_b, alpha_arr.device_data(), Barr.device_data(), N_b,
276  Aarr.device_data(), 1, beta_arr.device_data(), Carr.device_data(), 1, 2);
277  queue.sync();
278 
279  C.updateFrom();
280  C2.updateFrom();
281 
282  if (trans == 'T')
283  BLAS::gemv_trans(M_b, N_b, B2.data(), A2.data(), D2.data());
284  else
285  BLAS::gemv(M_b, N_b, B2.data(), A2.data(), D2.data());
286 
287  for (int index = 0; index < M; index++)
288  {
289  CHECK(C[index] == D[index]);
290  CHECK(C2[index] == D2[index] * alpha_arr[0]);
291  }
292 }
293 
294 template<PlatformKind PL>
296 {
297  const int M = 137;
298  const int N = 79;
299 
300  std::cout << "Testing NOTRANS gemv" << std::endl;
301  test_one_gemv<PL, float>(M, N, 'N');
302  test_one_gemv<PL, double>(M, N, 'N');
303 #if defined(QMC_COMPLEX)
304  test_one_gemv<PL, std::complex<float>>(N, M, 'N');
305  test_one_gemv<PL, std::complex<double>>(N, M, 'N');
306 #endif
307  std::cout << "Testing TRANS gemv" << std::endl;
308  test_one_gemv<PL, float>(M, N, 'T');
309  test_one_gemv<PL, double>(M, N, 'T');
310 #if defined(QMC_COMPLEX)
311  test_one_gemv<PL, std::complex<float>>(N, M, 'T');
312  test_one_gemv<PL, std::complex<double>>(N, M, 'T');
313 #endif
314 }
315 
316 template<PlatformKind PL, typename T>
317 void test_one_ger(const int M, const int N)
318 {
320  using mat_t = Matrix<T, PinnedDualAllocator<T>>;
321 
322  mat_t Ah(M, N); // Input matrix
323  mat_t Ad(M, N); // Input matrix
324  vec_t x(M); // Input vector
325  vec_t y(N); // Input vector
326 
327  // Fill data
328  for (int i = 0; i < M; i++)
329  x[i] = i;
330  for (int i = 0; i < N; i++)
331  y[i] = N - i;
332 
333  for (int j = 0; j < M; j++)
334  for (int i = 0; i < N; i++)
335  {
336  Ah[j][i] = i + j * 2;
337  Ad[j][i] = i + j * 2;
338  }
339 
340  Ad.updateTo();
341  x.updateTo();
342  y.updateTo();
343 
344  T alpha(0.5);
345 
348  // in Fortran, B[M][N] is viewed as B^T
349  compute::BLAS::ger(h_blas, M, N, alpha, x.device_data(), 1, y.device_data(), 1, Ad.device_data(), M);
350  queue.sync();
351  Ad.updateFrom();
352 
353  BLAS::ger(M, N, alpha, x.data(), 1, y.data(), 1, Ah.data(), M);
354 
355  for (int j = 0; j < M; j++)
356  for (int i = 0; i < N; i++)
357  CHECK(Ah[j][i] == Ad[j][i]);
358 
359  mat_t Ah2(M, N); // Input matrix
360  mat_t Ad2(M, N); // Input matrix
361  vec_t x2(M); // Input vector
362  vec_t y2(N); // Input vector
363 
364  // Fill data
365  for (int i = 0; i < M; i++)
366  x2[i] = i - 1;
367  for (int i = 0; i < N; i++)
368  y2[i] = N + i;
369 
370  for (int j = 0; j < M; j++)
371  for (int i = 0; i < N; i++)
372  {
373  Ah2[j][i] = j + i * 2;
374  Ad2[j][i] = j + i * 2;
375  }
376 
377  Ad2.updateTo();
378  x2.updateTo();
379  y2.updateTo();
380 
383  Vector<T, PinnedDualAllocator<T>> alpha_arr(2);
384 
385  Aarr[0] = Ad2.device_data();
386  Aarr[1] = Ad.device_data();
387  Xarr[0] = x2.device_data();
388  Xarr[1] = x.device_data();
389  Yarr[0] = y2.device_data();
390  Yarr[1] = y.device_data();
391 
392  alpha_arr[0] = 2;
393  alpha_arr[1] = 0.5;
394 
395  Aarr.updateTo();
396  Xarr.updateTo();
397  Yarr.updateTo();
398  alpha_arr.updateTo();
399 
400  // in Fortran, B[M][N] is viewed as B^T
401  compute::BLAS::ger_batched(h_blas, M, N, alpha_arr.device_data(), Xarr.device_data(), 1, Yarr.device_data(), 1,
402  Aarr.device_data(), M, 2);
403  queue.sync();
404  Ad.updateFrom();
405  Ad2.updateFrom();
406 
407  BLAS::ger(M, N, alpha_arr[1], x.data(), 1, y.data(), 1, Ah.data(), M);
408  BLAS::ger(M, N, alpha_arr[0], x2.data(), 1, y2.data(), 1, Ah2.data(), M);
409 
410  for (int j = 0; j < M; j++)
411  for (int i = 0; i < N; i++)
412  {
413  CHECK(Ah[j][i] == Ad[j][i]);
414  CHECK(Ah2[j][i] == Ad2[j][i]);
415  }
416 }
417 
418 template<PlatformKind PL>
420 {
421  const int M = 137;
422  const int N = 79;
423 
424  // Batched Test
425  std::cout << "Testing ger_batched" << std::endl;
426  test_one_ger<PL, float>(M, N);
427  test_one_ger<PL, double>(M, N);
428 #if defined(QMC_COMPLEX)
429  test_one_ger<PL, std::complex<float>>(N, M);
430  test_one_ger<PL, std::complex<double>>(N, M);
431 #endif
432 }
433 
434 TEST_CASE("AccelBLAS", "[BLAS]")
435 {
436  SECTION("gemm")
437  {
438 #if defined(ENABLE_CUDA)
439  std::cout << "Testing gemm<PlatformKind::CUDA>" << std::endl;
440  test_gemm_cases<PlatformKind::CUDA>();
441 #endif
442 #if defined(ENABLE_SYCL)
443  std::cout << "Testing gemm<PlatformKind::SYCL>" << std::endl;
444  test_gemm_cases<PlatformKind::SYCL>();
445 #endif
446 #if defined(ENABLE_OFFLOAD)
447  std::cout << "Testing gemm<PlatformKind::OMPTARGET>" << std::endl;
448  test_gemm_cases<PlatformKind::OMPTARGET>();
449 #endif
450  }
451 
452  SECTION("gemv")
453  {
454 #if defined(ENABLE_CUDA)
455  std::cout << "Testing gemm<PlatformKind::CUDA>" << std::endl;
456  test_gemv_cases<PlatformKind::CUDA>();
457 #endif
458 #if defined(ENABLE_SYCL)
459  std::cout << "Testing gemm<PlatformKind::SYCL>" << std::endl;
460  test_gemv_cases<PlatformKind::SYCL>();
461 #endif
462 #if defined(ENABLE_OFFLOAD)
463  std::cout << "Testing gemm<PlatformKind::OMPTARGET>" << std::endl;
464  test_gemv_cases<PlatformKind::OMPTARGET>();
465 #endif
466  }
467 
468  SECTION("ger")
469  {
470 #if defined(ENABLE_CUDA)
471  std::cout << "Testing ger<PlatformKind::CUDA>" << std::endl;
472  test_ger_cases<PlatformKind::CUDA>();
473 #endif
474 #if defined(ENABLE_SYCL)
475  std::cout << "Testing ger<PlatformKind::SYCL>" << std::endl;
476  test_ger_cases<PlatformKind::SYCL>();
477 #endif
478 #if defined(ENABLE_OFFLOAD)
479  std::cout << "Testing ger<PlatformKind::OMPTARGET>" << std::endl;
480  test_ger_cases<PlatformKind::OMPTARGET>();
481 #endif
482  }
483 }
484 } // namespace qmcplusplus
void gemm(BLASHandle< PlatformKind::CUDA > &handle, const char transa, const char transb, int m, int n, int k, const float &alpha, const float *A, int lda, const float *B, int ldb, const float &beta, float *C, int ldc)
static void gemv_trans(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:147
QMCTraits::RealType real
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
TEST_CASE("complex_helper", "[type_traits]")
pointer device_data()
Return the device_ptr matching X if this is a vector attached or owning dual space memory...
Definition: OhmmsVector.h:245
static void gemv(int n, int m, const double *restrict amat, const double *restrict x, double *restrict y)
Definition: BLAS.hpp:118
void test_ger_cases()
float imag(const float &c)
imaginary part of a scalar. Cannot be replaced by std::imag due to AFQMC specific needs...
void ger(BLASHandle< PlatformKind::CUDA > &handle, const int m, const int n, const float &alpha, const float *const x, const int incx, const float *const y, const int incy, float *const A, const int lda)
void gemv_batched(BLASHandle< PlatformKind::CUDA > &handle, const char trans, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const int batch_count)
These allocators are to make code that should be generic with the respect to accelerator code flavor ...
static void ger(int m, int n, double alpha, const double *x, int incx, const double *y, int incy, double *a, int lda)
Definition: BLAS.hpp:437
void gemm_batched(BLASHandle< PlatformKind::CUDA > &handle, const char transa, const char transb, int m, int n, int k, const float &alpha, const float *const A[], int lda, const float *const B[], int ldb, const float &beta, float *const C[], int ldc, int batchCount)
void test_one_gemv(const int M_b, const int N_b, const char trans)
void test_gemm_cases()
void test_one_gemm(const int M, const int N, const int K, const char transa, const char transb)
Declaraton of Vector<T,Alloc> Manage memory through Alloc directly and allow referencing an existing ...
CHECK(log_values[0]==ComplexApprox(std::complex< double >{ 5.603777579195571, -6.1586603331188225 }))
void ger_batched(BLASHandle< PlatformKind::CUDA > &handle, const int m, const int n, const T *alpha, const T *const x[], const int incx, const T *const y[], const int incy, T *const A[], const int lda, const int batch_count)
static void gemm(char Atrans, char Btrans, int M, int N, int K, double alpha, const double *A, int lda, const double *restrict B, int ldb, double beta, double *restrict C, int ldc)
Definition: BLAS.hpp:235
void updateTo(size_type size=0, std::ptrdiff_t offset=0)
Definition: OhmmsVector.h:263
double B(double x, int k, int i, const std::vector< double > &t)
void gemv(BLASHandle< PlatformKind::CUDA > &handle, const char trans, const int m, const int n, const float &alpha, const float *const A, const int lda, const float *const x, const int incx, const float &beta, float *const y, const int incy)
void test_gemv_cases()
void test_one_ger(const int M, const int N)