QMCPACK
ompBLAS.cpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2020 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 
13 #include "ompBLAS.hpp"
14 #include <cstdint>
15 #include <stdexcept>
16 #include "config.h"
17 #if !defined(OPENMP_NO_COMPLEX)
18 #include "ompReductionComplex.hpp"
19 #endif
20 
21 namespace qmcplusplus
22 {
23 namespace ompBLAS
24 {
25 
26 template<typename T>
28  const char transa,
29  const char transb,
30  const int M,
31  const int N,
32  const int K,
33  const T& alpha,
34  const T* const A,
35  const int lda,
36  const T* const B,
37  const int ldb,
38  const T& beta,
39  T* const C,
40  const int ldc)
41 {
42  if (M == 0 || N == 0 || K == 0)
43  return 0;
44 
45  if (transa == 'T' && transb == 'N') //A(ji) * B(jk) -> C(ik)
46  {
47  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(A, B, C)")
48  for (size_t m = 0; m < M; m++)
49  for (size_t n = 0; n < N; n++)
50  {
51  T sum(0);
52  for (size_t k = 0; k < K; k++)
53  sum += A[lda * m + k] * B[ldb * n + k];
54  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
55  }
56  }
57  else if (transa == 'T' && transb == 'T')
58  {
59  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(A, B, C)")
60  for (size_t m = 0; m < M; m++)
61  for (size_t n = 0; n < N; n++)
62  {
63  T sum(0);
64  for (size_t k = 0; k < K; k++)
65  sum += A[lda * m + k] * B[ldb * k + n];
66  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
67  }
68  }
69  else if (transa == 'N' && transb == 'T')
70  {
71  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(A, B, C)")
72  for (size_t m = 0; m < M; m++)
73  for (size_t n = 0; n < N; n++)
74  {
75  T sum(0);
76  for (size_t k = 0; k < K; k++)
77  sum += A[lda * k + m] * B[ldb * k + n];
78  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
79  }
80  }
81  else if (transa == 'N' && transb == 'N')
82  {
83  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(A, B, C)")
84  for (size_t n = 0; n < N; n++)
85  for (size_t m = 0; m < M; m++)
86  {
87  T sum(0);
88  for (size_t k = 0; k < K; k++)
89  sum += A[lda * k + m] * B[ldb * n + k];
90  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
91  }
92  }
93  else
94  throw std::runtime_error("Error: trans=='C' not yet implemented for ompBLAS::gemm.");
95 
96  return 0;
97 }
98 
99 template<>
101  const char transa,
102  const char transb,
103  const int M,
104  const int N,
105  const int K,
106  const float& alpha,
107  const float* const A,
108  const int lda,
109  const float* const B,
110  const int ldb,
111  const float& beta,
112  float* const C,
113  const int ldc)
114 {
115  return gemm_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
116 }
117 
118 template<>
120  const char transa,
121  const char transb,
122  const int M,
123  const int N,
124  const int K,
125  const double& alpha,
126  const double* const A,
127  const int lda,
128  const double* const B,
129  const int ldb,
130  const double& beta,
131  double* const C,
132  const int ldc)
133 {
134  return gemm_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
135 }
136 
137 #if !defined(OPENMP_NO_COMPLEX)
138 template<>
139 ompBLAS_status gemm<std::complex<float>>(ompBLAS_handle& handle,
140  const char transa,
141  const char transb,
142  const int M,
143  const int N,
144  const int K,
145  const std::complex<float>& alpha,
146  const std::complex<float>* const A,
147  const int lda,
148  const std::complex<float>* const B,
149  const int ldb,
150  const std::complex<float>& beta,
151  std::complex<float>* const C,
152  const int ldc)
153 {
154  return gemm_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
155 }
156 
157 template<>
158 ompBLAS_status gemm<std::complex<double>>(ompBLAS_handle& handle,
159  const char transa,
160  const char transb,
161  const int M,
162  const int N,
163  const int K,
164  const std::complex<double>& alpha,
165  const std::complex<double>* const A,
166  const int lda,
167  const std::complex<double>* const B,
168  const int ldb,
169  const std::complex<double>& beta,
170  std::complex<double>* const C,
171  const int ldc)
172 {
173  return gemm_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
174 }
175 #endif
176 
177 template<typename T>
179  const char transa,
180  const char transb,
181  const int M,
182  const int N,
183  const int K,
184  const T alpha,
185  const T* const Aarray[],
186  const int lda,
187  const T* const Barray[],
188  const int ldb,
189  const T beta,
190  T* const Carray[],
191  const int ldc,
192  const int batch_count)
193 {
194  if (M == 0 || N == 0 || K == 0 || batch_count == 0)
195  return 0;
196 
197  if (transa == 'T' && transb == 'N') //A(ji) * B(jk) -> C(ik)
198  {
199  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Aarray, Barray, Carray)")
200  for (size_t iw = 0; iw < batch_count; iw++)
201  {
202  auto A = Aarray[iw];
203  auto B = Barray[iw];
204  auto C = Carray[iw];
205  PRAGMA_OFFLOAD("omp parallel for collapse(2)")
206  for (size_t m = 0; m < M; m++)
207  for (size_t n = 0; n < N; n++)
208  {
209  T sum(0);
210  for (size_t k = 0; k < K; k++)
211  sum += A[lda * m + k] * B[ldb * n + k];
212  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
213  }
214  }
215  }
216  else if (transa == 'T' && transb == 'T')
217  {
218  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Aarray, Barray, Carray)")
219  for (size_t iw = 0; iw < batch_count; iw++)
220  {
221  auto A = Aarray[iw];
222  auto B = Barray[iw];
223  auto C = Carray[iw];
224  PRAGMA_OFFLOAD("omp parallel for collapse(2)")
225  for (size_t m = 0; m < M; m++)
226  for (size_t n = 0; n < N; n++)
227  {
228  T sum(0);
229  for (size_t k = 0; k < K; k++)
230  sum += A[lda * m + k] * B[ldb * k + n];
231  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
232  }
233  }
234  }
235  else if (transa == 'N' && transb == 'T')
236  {
237  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Aarray, Barray, Carray)")
238  for (size_t iw = 0; iw < batch_count; iw++)
239  {
240  auto A = Aarray[iw];
241  auto B = Barray[iw];
242  auto C = Carray[iw];
243  PRAGMA_OFFLOAD("omp parallel for collapse(2)")
244  for (size_t m = 0; m < M; m++)
245  for (size_t n = 0; n < N; n++)
246  {
247  T sum(0);
248  for (size_t k = 0; k < K; k++)
249  sum += A[lda * k + m] * B[ldb * k + n];
250  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
251  }
252  }
253  }
254  else if (transa == 'N' && transb == 'N')
255  {
256  PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(Aarray, Barray, Carray)")
257  for (size_t iw = 0; iw < batch_count; iw++)
258  {
259  auto A = Aarray[iw];
260  auto B = Barray[iw];
261  auto C = Carray[iw];
262  PRAGMA_OFFLOAD("omp parallel for collapse(2)")
263  for (size_t n = 0; n < N; n++)
264  for (size_t m = 0; m < M; m++)
265  {
266  T sum(0);
267  for (size_t k = 0; k < K; k++)
268  sum += A[lda * k + m] * B[ldb * n + k];
269  C[n * ldc + m] = alpha * sum + (beta == T(0) ? T(0) : C[n * ldc + m] * beta);
270  }
271  }
272  }
273  else
274  throw std::runtime_error("Error: trans=='C' not yet implemented for ompBLAS::gemm.");
275 
276  return 0;
277 }
278 
279 template<>
281  const char transa,
282  const char transb,
283  const int M,
284  const int N,
285  const int K,
286  const float& alpha,
287  const float* const A[],
288  const int lda,
289  const float* const B[],
290  const int ldb,
291  const float& beta,
292  float* const C[],
293  const int ldc,
294  const int batch_count)
295 {
296  return gemm_batched_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, batch_count);
297 }
298 
299 template<>
301  const char transa,
302  const char transb,
303  const int M,
304  const int N,
305  const int K,
306  const double& alpha,
307  const double* const A[],
308  const int lda,
309  const double* const B[],
310  const int ldb,
311  const double& beta,
312  double* const C[],
313  const int ldc,
314  const int batch_count)
315 {
316  return gemm_batched_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, batch_count);
317 }
318 
319 #if !defined(OPENMP_NO_COMPLEX)
320 template<>
321 ompBLAS_status gemm_batched<std::complex<float>>(ompBLAS_handle& handle,
322  const char transa,
323  const char transb,
324  const int M,
325  const int N,
326  const int K,
327  const std::complex<float>& alpha,
328  const std::complex<float>* const A[],
329  const int lda,
330  const std::complex<float>* const B[],
331  const int ldb,
332  const std::complex<float>& beta,
333  std::complex<float>* const C[],
334  const int ldc,
335  const int batch_count)
336 {
337  return gemm_batched_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, batch_count);
338 }
339 
340 template<>
341 ompBLAS_status gemm_batched<std::complex<double>>(ompBLAS_handle& handle,
342  const char transa,
343  const char transb,
344  const int M,
345  const int N,
346  const int K,
347  const std::complex<double>& alpha,
348  const std::complex<double>* const A[],
349  const int lda,
350  const std::complex<double>* const B[],
351  const int ldb,
352  const std::complex<double>& beta,
353  std::complex<double>* const C[],
354  const int ldc,
355  const int batch_count)
356 {
357  return gemm_batched_impl(handle, transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, batch_count);
358 }
359 #endif
360 
361 template<typename T>
363  const char trans,
364  const int m,
365  const int n,
366  const T alpha,
367  const T* const A,
368  const int lda,
369  const T* const x,
370  const int incx,
371  const T beta,
372  T* const y,
373  const int incy)
374 {
375  if (m == 0 || n == 0)
376  return 0;
377 
378  if (trans == 'T')
379  {
380  if (incx != 1 || incy != 1)
381  throw std::runtime_error("incx!=1 or incy!=1 are not implemented in ompBLAS::gemv_impl trans='T'!");
382 
383  PRAGMA_OFFLOAD("omp target teams distribute num_teams(n) is_device_ptr(A, x, y)")
384  for (uint32_t i = 0; i < n; i++)
385  {
386  T dot_sum(0);
387  PRAGMA_OFFLOAD("omp parallel for simd reduction(+: dot_sum)")
388  for (uint32_t j = 0; j < m; j++)
389  dot_sum += x[j] * A[i * lda + j];
390  if (beta == T(0))
391  y[i] = alpha * dot_sum; // protecting NaN from y
392  else
393  y[i] = alpha * dot_sum + beta * y[i];
394  }
395  return 0;
396  }
397  else if (trans == 'N')
398  {
399  if (incx != 1 || incy != 1)
400  throw std::runtime_error("incx !=1 or incy != 1 are not implemented in ompBLAS::gemv_impl trans='N'!");
401 
402  PRAGMA_OFFLOAD("omp target teams distribute num_teams(m) is_device_ptr(A, x, y)")
403  for (uint32_t i = 0; i < m; i++)
404  {
405  T dot_sum(0);
406  PRAGMA_OFFLOAD("omp parallel for simd reduction(+: dot_sum)")
407  for (uint32_t j = 0; j < n; j++)
408  dot_sum += x[j] * A[j * lda + i];
409  if (beta == T(0))
410  y[i] = alpha * dot_sum; // protecting NaN from y
411  else
412  y[i] = alpha * dot_sum + beta * y[i];
413  }
414  return 0;
415  }
416  else
417  throw std::runtime_error("Error: trans=='C' not yet implemented for ompBLAS::gemv_impl.");
418 }
419 
420 template<>
422  const char trans,
423  const int m,
424  const int n,
425  const float alpha,
426  const float* const A,
427  const int lda,
428  const float* const x,
429  const int incx,
430  const float beta,
431  float* const y,
432  const int incy)
433 {
434  return gemv_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
435 }
436 
437 template<>
439  const char trans,
440  const int m,
441  const int n,
442  const double alpha,
443  const double* const A,
444  const int lda,
445  const double* const x,
446  const int incx,
447  const double beta,
448  double* const y,
449  const int incy)
450 {
451  return gemv_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
452 }
453 
454 #if !defined(OPENMP_NO_COMPLEX)
455 template<>
456 ompBLAS_status gemv<std::complex<float>>(ompBLAS_handle& handle,
457  const char trans,
458  const int m,
459  const int n,
460  const std::complex<float> alpha,
461  const std::complex<float>* const A,
462  const int lda,
463  const std::complex<float>* const x,
464  const int incx,
465  const std::complex<float> beta,
466  std::complex<float>* const y,
467  const int incy)
468 {
469  return gemv_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
470 }
471 
472 template<>
473 ompBLAS_status gemv<std::complex<double>>(ompBLAS_handle& handle,
474  const char trans,
475  const int m,
476  const int n,
477  const std::complex<double> alpha,
478  const std::complex<double>* const A,
479  const int lda,
480  const std::complex<double>* const x,
481  const int incx,
482  const std::complex<double> beta,
483  std::complex<double>* const y,
484  const int incy)
485 {
486  return gemv_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
487 }
488 #endif
489 
490 
491 template<typename T>
493  const char trans,
494  const int m,
495  const int n,
496  const T* alpha,
497  const T* const A[],
498  const int lda,
499  const T* const x[],
500  const int incx,
501  const T* beta,
502  T* const y[],
503  const int incy,
504  const int batch_count)
505 {
506  if (m == 0 || n == 0 || batch_count == 0)
507  return 0;
508 
509  if (trans == 'T')
510  {
511  if (incx != 1)
512  throw std::runtime_error("incx!=1 are not implemented in ompBLAS::gemv_batched_impl trans='T'!");
513 
514  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(batch_count * n) \
515  is_device_ptr(A, x, y, alpha, beta)")
516  for (uint32_t ib = 0; ib < batch_count; ib++)
517  for (uint32_t i = 0; i < n; i++)
518  {
519  T dot_sum(0);
520  PRAGMA_OFFLOAD("omp parallel for simd reduction(+: dot_sum)")
521  for (uint32_t j = 0; j < m; j++)
522  dot_sum += x[ib][j] * A[ib][i * lda + j];
523  if (beta[ib] == T(0))
524  y[ib][i * incy] = alpha[ib] * dot_sum; // protecting NaN from y
525  else
526  y[ib][i * incy] = alpha[ib] * dot_sum + beta[ib] * y[ib][i * incy];
527  }
528  return 0;
529  }
530  else if (trans == 'N')
531  {
532  if (incx != 1)
533  throw std::runtime_error("incx!=1 are not implemented in ompBLAS::gemv_batched_impl trans='N'!");
534 
535  PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(batch_count * n) \
536  is_device_ptr(A, x, y, alpha, beta)")
537  for (uint32_t ib = 0; ib < batch_count; ib++)
538  for (uint32_t i = 0; i < m; i++)
539  {
540  T dot_sum(0);
541  PRAGMA_OFFLOAD("omp parallel for simd reduction(+: dot_sum)")
542  for (uint32_t j = 0; j < n; j++)
543  dot_sum += x[ib][j] * A[ib][j * lda + i];
544  if (beta[ib] == T(0))
545  y[ib][i * incy] = alpha[ib] * dot_sum; // protecting NaN from y
546  else
547  y[ib][i * incy] = alpha[ib] * dot_sum + beta[ib] * y[ib][i * incy];
548  }
549  return 0;
550  }
551  else
552  throw std::runtime_error("Error: trans=='C' not yet implemented for ompBLAS::gemv_impl.");
553 }
554 
555 template<>
557  const char trans,
558  const int m,
559  const int n,
560  const float* alpha,
561  const float* const A[],
562  const int lda,
563  const float* const x[],
564  const int incx,
565  const float* beta,
566  float* const y[],
567  const int incy,
568  const int batch_count)
569 {
570  return gemv_batched_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
571 }
572 
573 template<>
575  const char trans,
576  const int m,
577  const int n,
578  const double* alpha,
579  const double* const A[],
580  const int lda,
581  const double* const x[],
582  const int incx,
583  const double* beta,
584  double* const y[],
585  const int incy,
586  const int batch_count)
587 {
588  return gemv_batched_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
589 }
590 
591 #if !defined(OPENMP_NO_COMPLEX)
592 template<>
593 ompBLAS_status gemv_batched<std::complex<float>>(ompBLAS_handle& handle,
594  const char trans,
595  const int m,
596  const int n,
597  const std::complex<float>* alpha,
598  const std::complex<float>* const A[],
599  const int lda,
600  const std::complex<float>* const x[],
601  const int incx,
602  const std::complex<float>* beta,
603  std::complex<float>* const y[],
604  const int incy,
605  const int batch_count)
606 {
607  return gemv_batched_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
608 }
609 
610 template<>
611 ompBLAS_status gemv_batched<std::complex<double>>(ompBLAS_handle& handle,
612  const char trans,
613  const int m,
614  const int n,
615  const std::complex<double>* alpha,
616  const std::complex<double>* const A[],
617  const int lda,
618  const std::complex<double>* const x[],
619  const int incx,
620  const std::complex<double>* beta,
621  std::complex<double>* const y[],
622  const int incy,
623  const int batch_count)
624 {
625  return gemv_batched_impl(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
626 }
627 #endif
628 
629 
630 template<typename T>
632  const int m,
633  const int n,
634  const T alpha,
635  const T* const x,
636  const int incx,
637  const T* const y,
638  const int incy,
639  T* const A,
640  const int lda)
641 {
642  if (m == 0 || n == 0)
643  return 0;
644 
645  if (incx != 1 || incy != 1)
646  throw std::runtime_error("incx !=1 or incy != 1 are not implemented in ompBLAS::ger_impl!");
647 
648  //BLAS::ger(m, n, alpha, x, incx, y, incy, A, lda);
649  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(A, x, y)")
650  for (uint32_t i = 0; i < n; i++)
651  for (uint32_t j = 0; j < m; j++)
652  A[i * lda + j] += alpha * x[j] * y[i];
653  return 0;
654 }
655 
656 template<>
658  const int m,
659  const int n,
660  const float alpha,
661  const float* const x,
662  const int incx,
663  const float* const y,
664  const int incy,
665  float* const A,
666  const int lda)
667 {
668  return ger_impl(handle, m, n, alpha, x, incx, y, incy, A, lda);
669 }
670 
671 template<>
673  const int m,
674  const int n,
675  const double alpha,
676  const double* const x,
677  const int incx,
678  const double* const y,
679  const int incy,
680  double* const A,
681  const int lda)
682 {
683  return ger_impl(handle, m, n, alpha, x, incx, y, incy, A, lda);
684 }
685 
686 #if !defined(OPENMP_NO_COMPLEX)
687 template<>
688 ompBLAS_status ger<std::complex<float>>(ompBLAS_handle& handle,
689  const int m,
690  const int n,
691  const std::complex<float> alpha,
692  const std::complex<float>* const x,
693  const int incx,
694  const std::complex<float>* const y,
695  const int incy,
696  std::complex<float>* const A,
697  const int lda)
698 {
699  return ger_impl(handle, m, n, alpha, x, incx, y, incy, A, lda);
700 }
701 
702 template<>
703 ompBLAS_status ger<std::complex<double>>(ompBLAS_handle& handle,
704  const int m,
705  const int n,
706  const std::complex<double> alpha,
707  const std::complex<double>* const x,
708  const int incx,
709  const std::complex<double>* const y,
710  const int incy,
711  std::complex<double>* const A,
712  const int lda)
713 {
714  return ger_impl(handle, m, n, alpha, x, incx, y, incy, A, lda);
715 }
716 #endif
717 
718 
719 template<typename T>
721  const int m,
722  const int n,
723  const T* alpha,
724  const T* const x[],
725  const int incx,
726  const T* const y[],
727  const int incy,
728  T* const A[],
729  const int lda,
730  const int batch_count)
731 {
732  if (m == 0 || n == 0 || batch_count == 0)
733  return 0;
734 
735 
736  if (incx != 1)
737  throw std::runtime_error("incx!=1 are not implemented in ompBLAS::ger_batched_impl!");
738 
739  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(3) is_device_ptr(A, x, y, alpha)")
740  for (uint32_t ib = 0; ib < batch_count; ib++)
741  for (uint32_t i = 0; i < n; i++)
742  for (uint32_t j = 0; j < m; j++)
743  A[ib][i * lda + j] += alpha[ib] * x[ib][j] * y[ib][i * incy];
744  return 0;
745 }
746 
747 template<>
749  const int m,
750  const int n,
751  const float* alpha,
752  const float* const x[],
753  const int incx,
754  const float* const y[],
755  const int incy,
756  float* const A[],
757  const int lda,
758  const int batch_count)
759 {
760  return ger_batched_impl(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count);
761 }
762 
763 template<>
765  const int m,
766  const int n,
767  const double* alpha,
768  const double* const x[],
769  const int incx,
770  const double* const y[],
771  const int incy,
772  double* const A[],
773  const int lda,
774  const int batch_count)
775 {
776  return ger_batched_impl(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count);
777 }
778 
779 #if !defined(OPENMP_NO_COMPLEX)
780 template<>
781 ompBLAS_status ger_batched<std::complex<float>>(ompBLAS_handle& handle,
782  const int m,
783  const int n,
784  const std::complex<float>* alpha,
785  const std::complex<float>* const x[],
786  const int incx,
787  const std::complex<float>* const y[],
788  const int incy,
789  std::complex<float>* const A[],
790  const int lda,
791  const int batch_count)
792 {
793  return ger_batched_impl(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count);
794 }
795 
796 template<>
797 ompBLAS_status ger_batched<std::complex<double>>(ompBLAS_handle& handle,
798  const int m,
799  const int n,
800  const std::complex<double>* alpha,
801  const std::complex<double>* const x[],
802  const int incx,
803  const std::complex<double>* const y[],
804  const int incy,
805  std::complex<double>* const A[],
806  const int lda,
807  const int batch_count)
808 {
809  return ger_batched_impl(handle, m, n, alpha, x, incx, y, incy, A, lda, batch_count);
810 }
811 #endif
812 
813 
814 template<typename T>
816  const int n,
817  const T* const x[],
818  const int incx,
819  T* const y[],
820  const int incy,
821  const int batch_count)
822 {
823  if (n == 0 || batch_count == 0)
824  return 0;
825 
826  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(x, y)")
827  for (uint32_t ib = 0; ib < batch_count; ib++)
828  for (uint32_t i = 0; i < n; i++)
829  y[ib][i * incy] = x[ib][i * incx];
830  return 0;
831 }
832 
833 template<>
835  const int n,
836  const float* const x[],
837  const int incx,
838  float* const y[],
839  const int incy,
840  const int batch_count)
841 {
842  return copy_batched_impl(handle, n, x, incx, y, incy, batch_count);
843 }
844 
845 template<>
847  const int n,
848  const double* const x[],
849  const int incx,
850  double* const y[],
851  const int incy,
852  const int batch_count)
853 {
854  return copy_batched_impl(handle, n, x, incx, y, incy, batch_count);
855 }
856 
857 #if !defined(OPENMP_NO_COMPLEX)
858 template<>
859 ompBLAS_status copy_batched<std::complex<float>>(ompBLAS_handle& handle,
860  const int n,
861  const std::complex<float>* const x[],
862  const int incx,
863  std::complex<float>* const y[],
864  const int incy,
865  const int batch_count)
866 {
867  return copy_batched_impl(handle, n, x, incx, y, incy, batch_count);
868 }
869 
870 template<>
871 ompBLAS_status copy_batched<std::complex<double>>(ompBLAS_handle& handle,
872  const int n,
873  const std::complex<double>* const x[],
874  const int incx,
875  std::complex<double>* const y[],
876  const int incy,
877  const int batch_count)
878 {
879  return copy_batched_impl(handle, n, x, incx, y, incy, batch_count);
880 }
881 #endif
882 
883 template<typename T>
885  const int n,
886  const T* const x[],
887  const int x_offset,
888  const int incx,
889  T* const y[],
890  const int y_offset,
891  const int incy,
892  const int batch_count)
893 {
894  if (n == 0 || batch_count == 0)
895  return 0;
896 
897  PRAGMA_OFFLOAD("omp target teams distribute parallel for collapse(2) is_device_ptr(x, y)")
898  for (uint32_t ib = 0; ib < batch_count; ib++)
899  for (uint32_t i = 0; i < n; i++)
900  y[ib][y_offset + i * incy] = x[ib][x_offset + i * incx];
901  return 0;
902 }
903 
904 template<>
906  const int n,
907  const float* const x[],
908  const int x_offset,
909  const int incx,
910  float* const y[],
911  const int y_offset,
912  const int incy,
913  const int batch_count)
914 {
915  return copy_batched_offset_impl(handle, n, x, x_offset, incx, y, y_offset, incy, batch_count);
916 }
917 
918 template<>
920  const int n,
921  const double* const x[],
922  const int x_offset,
923  const int incx,
924  double* const y[],
925  const int y_offset,
926  const int incy,
927  const int batch_count)
928 {
929  return copy_batched_offset_impl(handle, n, x, x_offset, incx, y, y_offset, incy, batch_count);
930 }
931 
932 #if !defined(OPENMP_NO_COMPLEX)
933 template<>
934 ompBLAS_status copy_batched_offset<std::complex<float>>(ompBLAS_handle& handle,
935  const int n,
936  const std::complex<float>* const x[],
937  const int x_offset,
938  const int incx,
939  std::complex<float>* const y[],
940  const int y_offset,
941  const int incy,
942  const int batch_count)
943 {
944  return copy_batched_offset_impl(handle, n, x, x_offset, incx, y, y_offset, incy, batch_count);
945 }
946 
947 template<>
948 ompBLAS_status copy_batched_offset<std::complex<double>>(ompBLAS_handle& handle,
949  const int n,
950  const std::complex<double>* const x[],
951  const int x_offset,
952  const int incx,
953  std::complex<double>* const y[],
954  const int y_offset,
955  const int incy,
956  const int batch_count)
957 {
958  return copy_batched_offset_impl(handle, n, x, x_offset, incx, y, y_offset, incy, batch_count);
959 }
960 #endif
961 
962 template<typename T>
964  const int n,
965  const T* const x,
966  const int incx,
967  T* const y,
968  const int incy)
969 {
970  if (n == 0)
971  return 0;
972  PRAGMA_OFFLOAD("omp target teams distribute parallel for is_device_ptr(x, y)")
973  for (size_t i = 0; i < n; i++)
974  y[i * incy] = x[i * incx];
975  return 0;
976 }
977 
978 template<>
980  const int n,
981  const float* const x,
982  const int incx,
983  float* const y,
984  const int incy)
985 {
986  return copy_impl(handle, n, x, incx, y, incy);
987 }
988 
989 template<>
991  const int n,
992  const double* const x,
993  const int incx,
994  double* const y,
995  const int incy)
996 {
997  return copy_impl(handle, n, x, incx, y, incy);
998 }
999 
1000 template<>
1001 ompBLAS_status copy<std::complex<float>>(ompBLAS_handle& handle,
1002  const int n,
1003  const std::complex<float>* const x,
1004  const int incx,
1005  std::complex<float>* const y,
1006  const int incy)
1007 {
1008  return copy_impl(handle, n, x, incx, y, incy);
1009 }
1010 
1011 template<>
1012 ompBLAS_status copy<std::complex<double>>(ompBLAS_handle& handle,
1013  const int n,
1014  const std::complex<double>* const x,
1015  const int incx,
1016  std::complex<double>* const y,
1017  const int incy)
1018 {
1019  return copy_impl(handle, n, x, incx, y, incy);
1020 }
1021 } // namespace ompBLAS
1022 } // namespace qmcplusplus
ompBLAS_status ger_batched< float >(ompBLAS_handle &handle, const int m, const int n, const float *alpha, const float *const x[], const int incx, const float *const y[], const int incy, float *const A[], const int lda, const int batch_count)
Definition: ompBLAS.cpp:748
ompBLAS_status copy_batched_offset< float >(ompBLAS_handle &handle, const int n, const float *const x[], const int x_offset, const int incx, float *const y[], const int y_offset, const int incy, const int batch_count)
Definition: ompBLAS.cpp:905
ompBLAS_status copy_batched< double >(ompBLAS_handle &handle, const int n, const double *const x[], const int incx, double *const y[], const int incy, const int batch_count)
Definition: ompBLAS.cpp:846
ompBLAS_status gemm_batched_impl(ompBLAS_handle &handle, const char transa, const char transb, const int M, const int N, const int K, const T alpha, const T *const Aarray[], const int lda, const T *const Barray[], const int ldb, const T beta, T *const Carray[], const int ldc, const int batch_count)
Definition: ompBLAS.cpp:178
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
ompBLAS_status ger_impl(ompBLAS_handle &handle, const int m, const int n, const T alpha, const T *const x, const int incx, const T *const y, const int incy, T *const A, const int lda)
Definition: ompBLAS.cpp:631
ompBLAS_status gemm_impl(ompBLAS_handle &handle, const char transa, const char transb, const int M, const int N, const int K, const T &alpha, const T *const A, const int lda, const T *const B, const int ldb, const T &beta, T *const C, const int ldc)
Definition: ompBLAS.cpp:27
ompBLAS_status copy_batched_impl(ompBLAS_handle &handle, const int n, const T *const x[], const int incx, T *const y[], const int incy, const int batch_count)
Definition: ompBLAS.cpp:815
ompBLAS_status copy< double >(ompBLAS_handle &handle, const int n, const double *const x, const int incx, double *const y, const int incy)
Definition: ompBLAS.cpp:990
ompBLAS_status ger_batched_impl(ompBLAS_handle &handle, const int m, const int n, const T *alpha, const T *const x[], const int incx, const T *const y[], const int incy, T *const A[], const int lda, const int batch_count)
Definition: ompBLAS.cpp:720
ompBLAS_status copy_batched_offset_impl(ompBLAS_handle &handle, const int n, const T *const x[], const int x_offset, const int incx, T *const y[], const int y_offset, const int incy, const int batch_count)
Definition: ompBLAS.cpp:884
ompBLAS_status gemv_batched< float >(ompBLAS_handle &handle, const char trans, const int m, const int n, const float *alpha, const float *const A[], const int lda, const float *const x[], const int incx, const float *beta, float *const y[], const int incy, const int batch_count)
Definition: ompBLAS.cpp:556
ompBLAS_status gemm< float >(ompBLAS_handle &handle, const char transa, const char transb, const int M, const int N, const int K, const float &alpha, const float *const A, const int lda, const float *const B, const int ldb, const float &beta, float *const C, const int ldc)
Definition: ompBLAS.cpp:100
ompBLAS_status gemv< double >(ompBLAS_handle &handle, const char trans, const int m, const int n, const double alpha, const double *const A, const int lda, const double *const x, const int incx, const double beta, double *const y, const int incy)
Definition: ompBLAS.cpp:438
ompBLAS_status copy< float >(ompBLAS_handle &handle, const int n, const float *const x, const int incx, float *const y, const int incy)
Definition: ompBLAS.cpp:979
ompBLAS_status gemv_impl(ompBLAS_handle &handle, const char trans, const int m, const int n, const T alpha, const T *const A, const int lda, const T *const x, const int incx, const T beta, T *const y, const int incy)
Definition: ompBLAS.cpp:362
ompBLAS_status copy_impl(ompBLAS_handle &handle, const int n, const T *const x, const int incx, T *const y, const int incy)
Definition: ompBLAS.cpp:963
ompBLAS_status gemm_batched< double >(ompBLAS_handle &handle, const char transa, const char transb, const int M, const int N, const int K, const double &alpha, const double *const A[], const int lda, const double *const B[], const int ldb, const double &beta, double *const C[], const int ldc, const int batch_count)
Definition: ompBLAS.cpp:300
ompBLAS_status gemv_batched_impl(ompBLAS_handle &handle, const char trans, const int m, const int n, const T *alpha, const T *const A[], const int lda, const T *const x[], const int incx, const T *beta, T *const y[], const int incy, const int batch_count)
Definition: ompBLAS.cpp:492
ompBLAS_status ger< float >(ompBLAS_handle &handle, const int m, const int n, const float alpha, const float *const x, const int incx, const float *const y, const int incy, float *const A, const int lda)
Definition: ompBLAS.cpp:657
ompBLAS_status ger_batched< double >(ompBLAS_handle &handle, const int m, const int n, const double *alpha, const double *const x[], const int incx, const double *const y[], const int incy, double *const A[], const int lda, const int batch_count)
Definition: ompBLAS.cpp:764
ompBLAS_status gemm_batched< float >(ompBLAS_handle &handle, const char transa, const char transb, const int M, const int N, const int K, const float &alpha, const float *const A[], const int lda, const float *const B[], const int ldb, const float &beta, float *const C[], const int ldc, const int batch_count)
Definition: ompBLAS.cpp:280
ompBLAS_status ger< double >(ompBLAS_handle &handle, const int m, const int n, const double alpha, const double *const x, const int incx, const double *const y, const int incy, double *const A, const int lda)
Definition: ompBLAS.cpp:672
ompBLAS_status gemm< double >(ompBLAS_handle &handle, const char transa, const char transb, const int M, const int N, const int K, const double &alpha, const double *const A, const int lda, const double *const B, const int ldb, const double &beta, double *const C, const int ldc)
Definition: ompBLAS.cpp:119
ompBLAS_status copy_batched< float >(ompBLAS_handle &handle, const int n, const float *const x[], const int incx, float *const y[], const int incy, const int batch_count)
Definition: ompBLAS.cpp:834
double B(double x, int k, int i, const std::vector< double > &t)
ompBLAS_status copy_batched_offset< double >(ompBLAS_handle &handle, const int n, const double *const x[], const int x_offset, const int incx, double *const y[], const int y_offset, const int incy, const int batch_count)
Definition: ompBLAS.cpp:919
ompBLAS_status gemv< float >(ompBLAS_handle &handle, const char trans, const int m, const int n, const float alpha, const float *const A, const int lda, const float *const x, const int incx, const float beta, float *const y, const int incy)
Definition: ompBLAS.cpp:421
ompBLAS_status gemv_batched< double >(ompBLAS_handle &handle, const char trans, const int m, const int n, const double *alpha, const double *const A[], const int lda, const double *const x[], const int incx, const double *beta, double *const y[], const int incy, const int batch_count)
Definition: ompBLAS.cpp:574