QMCPACK
cuBLAS_LU.hpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2021 QMCPACK developers.
6 //
7 // File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
8 //
9 // File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 #ifndef QMCPLUSPLUS_CUBLAS_LU_HPP
13 #define QMCPLUSPLUS_CUBLAS_LU_HPP
14 
15 #include <complex>
16 #include <type_traits>
17 #include "config.h"
18 #include <CUDA/CUDAruntime.hpp>
19 #ifndef QMC_CUDA2HIP
20 #include <cublas_v2.h>
21 #include <cuComplex.h>
22 #else
23 #include <hipblas/hipblas.h>
24 #include <hip/hip_complex.h>
25 #include <ROCm/hipBLAS.hpp>
26 #endif
27 
28 /** \file
29  * At the qmcplusplus cuBLAS_LU level all *, **, *[] arguments are assumed to be
30  * device addresses.
31  */
32 namespace qmcplusplus
33 {
34 namespace cuBLAS_LU
35 {
36 /** Takes PsiM in column major layout and uses LU factorization to compute the log determinant and invPsiM.
37  * This is the call that QMCPACK should use.
38  *
39  * \param[inout] Ms - device pointers to pointers to Ms on input and to LU matrices on output
40  * \param[out] Cs - device pointers to memory space the same size as M, which is overwritten with invM
41  * \param[in] pivots - pointer to n * nw ints allocated in device memory for pivots array.
42  * \param[in] host_infos - pointer to nw ints allocated in pinned host memory for factorization infos
43  * \param[in] infos - pointer to nw ints allocated in device memory for factorization infos
44  * \param[out] log_dets - pointer to device memory for nw log determinant values to be returned; will be zeroed.
45  * \param[in] batch_size - if this changes over the course of a run, a huge performance hit will be taken as memory allocation syncs the device.
46  *
47  * The host_infos pointer is an exception to the device-address convention; this may be changed in the future. The logic for this should probably be in
48  * the next class up. This would obviously split the computeInverseAndDetLog_batched call.
49  */
50 template<typename T>
51 void computeInverseAndDetLog_batched(cublasHandle_t& h_cublas,
52  cudaStream_t& hstream,
53  const int n,
54  const int lda,
55  T* Ms[],
56  T* Cs[],
57  T* LU_diags,
58  int* pivots,
59  int* host_infos,
60  int* infos,
61  std::complex<double>* log_dets,
62  const int batch_size);
63 
64 template<typename T>
65 void computeGetrf_batched(cublasHandle_t& h_cublas,
66  cudaStream_t& hstream,
67  const int n,
68  const int lda,
69  T* Ms[],
70  int* pivots,
71  int* host_infos,
72  int* infos,
73  const int batch_size);
74 
75 template<typename T>
76 void computeLogDet_batched(cudaStream_t& hstream,
77  const int n,
78  const int lda,
79  T** Ms,
80  const int* pivots,
81  std::complex<double>* logdets,
82  const int batch_size);
83 
84 template<typename T>
85 void computeGetri_batched(cublasHandle_t& h_cublas,
86  cudaStream_t& hstream,
87  const int n,
88  const int lda,
89  T* Ms[],
90  T* Cs[],
91  int* pivots,
92  int* host_infos,
93  int* infos,
94  const int batch_size);
95 
96 extern template void computeInverseAndDetLog_batched<double>(cublasHandle_t& h_cublas,
97  cudaStream_t& hstream,
98  const int n,
99  const int lda,
100  double* Ms[],
101  double* Cs[],
102  double* LU_diags,
103  int* pivots,
104  int* host_infos,
105  int* infos,
106  std::complex<double>* log_dets,
107  const int batch_size);
108 
109 extern template void computeInverseAndDetLog_batched<std::complex<double>>(cublasHandle_t& h_cublas,
110  cudaStream_t& hstream,
111  const int n,
112  const int lda,
113  std::complex<double>* Ms[],
114  std::complex<double>* Cs[],
115  std::complex<double>* LU_diags,
116  int* pivots,
117  int* host_infos,
118  int* infos,
119  std::complex<double>* log_dets,
120  const int batch_size);
121 
122 } // namespace cuBLAS_LU
123 } // namespace qmcplusplus
124 #endif
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
handle CUDA/HIP runtime selection.
template void computeInverseAndDetLog_batched< double >(cublasHandle_t &h_cublas, cudaStream_t &hstream, const int n, const int lda, double *Ms[], double *Cs[], double *LU_diags, int *pivots, int *host_infos, int *infos, std::complex< double > *log_dets, const int batch_size)
#define cudaStream_t
Definition: cuda2hip.h:149
void computeInverseAndDetLog_batched(cublasHandle_t &h_cublas, cudaStream_t &hstream, const int n, const int lda, T *Ms[], T *Cs[], T *LU_diags, int *pivots, int *host_infos, int *infos, std::complex< double > *log_dets, const int batch_size)
Takes PsiM in column major layout and uses LU factorization to compute the log determinant and invPsi...
void computeGetrf_batched(cublasHandle_t &h_cublas, cudaStream_t &hstream, const int n, const int lda, T *Ms[], int *pivots, int *host_infos, int *infos, const int batch_size)
void computeLogDet_batched(cudaStream_t &hstream, const int n, const int lda, T **Ms, const int *pivots, std::complex< double > *logdets, const int batch_size)
std::vector< int, CUDAHostAllocator< int > > pivots
std::vector< int, CUDAHostAllocator< int > > infos(8, 1.0)
std::vector< double *, CUDAHostAllocator< double * > > Ms
void computeGetri_batched(cublasHandle_t &h_cublas, cudaStream_t &hstream, const int n, const int lda, T *Ms[], T *Cs[], int *pivots, int *host_infos, int *infos, const int batch_size)
#define cublasHandle_t
Definition: cuda2hip.h:35