DiracMatrixComputeCUDA.hpp
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2021 QMCPACK developers.
//
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab
//
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H
#define QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H

#include <type_traits>

#include "OhmmsPETE/OhmmsMatrix.h"
#include "DualAllocatorAliases.hpp"
#include "Concurrency/OpenMP.h"
#include "CPU/SIMD/algorithm.hpp"
#include "ResourceCollection.h"

namespace qmcplusplus
{
/** class defining a compute and memory resource to compute matrix inversion and
 * the log determinants of a batch of DiracMatrices.
 * The multiplicity is one per crowd, not one per UpdateEngine;
 * it matches the multiplicity of the accelerator calls
 * and the batched resource requirement.
 *
 * @tparam VALUE_FP the datatype used in the actual computation of the matrix inversion
 *
 * There are no per-walker variables; resources specific to the per-crowd
 * compute object are owned here. The compute object itself is the resource
 * provided to the per-walker DiracDeterminantBatched.
 * Resources used by this object but owned by the
 * surrounding scope are passed as arguments.
 *
 * All the public APIs are synchronous; the asynchronous queue argument gets synchronized before return.
 * rocBLAS, used indirectly via hipBLAS, requires synchronizing the old stream before setting a new one.
 * We don't need to synchronize the old stream explicitly because it gets synchronized right after each use.
 *
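 * A minimal usage sketch (illustrative only, not part of this header; `nw`, `n`, and the enclosing
 * crowd scope are assumed, and the matrix/vector types simply spell out the private
 * DualMatrix/DualVector aliases used by the members below). In production this object is normally
 * owned and lent per crowd through the Resource machinery rather than constructed ad hoc.
 * \code{.cpp}
 *   DiracMatrixComputeCUDA<double> inverter;
 *   compute::Queue<PlatformKind::CUDA> queue;
 *   std::vector<Matrix<double, PinnedDualAllocator<double>>> psiMs(nw), invMs(nw);
 *   Vector<std::complex<double>, PinnedDualAllocator<std::complex<double>>> log_values(nw);
 *   for (int iw = 0; iw < nw; ++iw)
 *   {
 *     psiMs[iw].resize(n, n);
 *     invMs[iw].resize(n, n);
 *     // ... fill psiMs[iw] on the host ...
 *   }
 *   RefVector<const Matrix<double, PinnedDualAllocator<double>>> psiM_refs(psiMs.begin(), psiMs.end());
 *   RefVector<Matrix<double, PinnedDualAllocator<double>>> invM_refs(invMs.begin(), invMs.end());
 *   inverter.mw_invertTranspose(queue, psiM_refs, invM_refs, log_values);
 *   // The call is synchronous: on return invMs holds the inverses on host and device and
 *   // log_values[iw] holds the log determinant of walker iw's matrix.
 * \endcode
 *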
 */
template<typename VALUE_FP>
class DiracMatrixComputeCUDA : public Resource
{
  using FullPrecReal = RealAlias<VALUE_FP>;
  using LogValue     = std::complex<FullPrecReal>;

  template<typename T>
  using DualMatrix = Matrix<T, PinnedDualAllocator<T>>;

  template<typename T>
  using DualVector = Vector<T, PinnedDualAllocator<T>>;

  // Contiguous memory for full precision (fp) matrices for each walker, n^2 * nw elements
  DualVector<VALUE_FP> psiM_fp_;
  DualVector<VALUE_FP> invM_fp_;

  // working vectors
  DualVector<VALUE_FP> LU_diags_fp_;
  DualVector<int> pivots_;
  DualVector<int> infos_;

  //DualMatrix<T_FP> temp_mat_;

  /** Transfer buffer for device pointers to matrices.
   * The element count is usually low and the transfer launch cost is higher than the transfer itself.
   * For this reason it is beneficial to fuse multiple lists of pointers.
   * Right now this buffer packs nw psiM pointers followed by nw invM pointers.
   * Use only within a function scope and do not rely on its previous value.
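   *
   * Layout sketch (illustrative, for nw = 2): { psiM_dev[0], psiM_dev[1], invM_dev[0], invM_dev[1] },
   * i.e. walker iw's psiM device pointer sits at index iw and its invM device pointer at index iw + nw.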
   */
  DualVector<VALUE_FP*> psiM_invM_ptrs_;

  // cuBLAS geam wants these.
  VALUE_FP host_one{1.0};
  VALUE_FP host_zero{0.0};

  // cuBLAS handle owned by this inverter
  cublasHandle_t h_cublas_;

  /** Calculates the actual inverse and log determinant on the accelerator
   *
   * \param[in] queue the compute queue; the CUDA stream used for transfers and kernels is retrieved from it.
   * \param[in] a_mats dual A matrices; their host data is copied and transposed into the full-precision work buffer on the device.
   * \param[out] inv_a_mats dual invM matrices
   * \param[in] n matrix rank.
   * \param[out] log_values log determinant value for each matrix, batch_size = log_values.size()
   *
   * On Volta, so far little seems to be gained by making the matrices contiguous.
   *
   * List of operations:
   * 1. matrix-by-matrix. Copy a_mat to inv_a_mat on the host, transfer inv_a_mat to the device, transpose it into psiM_fp_ on the device.
   * 2. batched. LU-factorize and invert.
   * 3. matrix-by-matrix. Transfer inv_a_mat to the host.
   *
   * Pros and cons:
   * 1. \todo Try doing this like mw_computeInvertAndLog_stride: copy and transpose into psiM_fp_ and fuse the transfer.
   * 3. \todo Remove the transfer of inv_a_mat to the host and let the upper-level code handle it.
   */
  inline void mw_computeInvertAndLog(compute::Queue<PlatformKind::CUDA>& queue,
                                     const RefVector<const DualMatrix<VALUE_FP>>& a_mats,
                                     const RefVector<DualMatrix<VALUE_FP>>& inv_a_mats,
                                     const int n,
                                     DualVector<LogValue>& log_values)
  {
    const int nw = a_mats.size();
    assert(a_mats.size() == inv_a_mats.size());

    psiM_invM_ptrs_.resize(nw * 2);
    const int lda   = a_mats[0].get().cols();
    const int ldinv = inv_a_mats[0].get().cols();
    cudaStream_t h_stream = queue.getNative();
    psiM_fp_.resize(n * ldinv * nw);

    for (int iw = 0; iw < nw; ++iw)
    {
      psiM_invM_ptrs_[iw]      = psiM_fp_.device_data() + iw * n * ldinv;
      psiM_invM_ptrs_[iw + nw] = inv_a_mats[iw].get().device_data();
      // Since inv_a_mat can have a different leading dimension from a_mat, first remap-copy on the host.
      simd::remapCopy(n, n, a_mats[iw].get().data(), lda, inv_a_mats[iw].get().data(), ldinv);
      // Then copy the a_mat data now held in inv_a_mat to the device.
      cudaErrorCheck(cudaMemcpyAsync(inv_a_mats[iw].get().device_data(), inv_a_mats[iw].get().data(),
                                     inv_a_mats[iw].get().size() * sizeof(VALUE_FP), cudaMemcpyHostToDevice, h_stream),
                     "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM to device");
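      // cuBLAS geam computes C = alpha * op(A) + beta * op(B); with alpha = 1, beta = 0, and
      // op(A) = A^T this amounts to an out-of-place transpose of the freshly uploaded matrix.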
      // On the device, transpose into the psiM_fp_ slot addressed by psiM_invM_ptrs_[iw].
      cublasErrorCheck(cuBLAS::geam(h_cublas_, CUBLAS_OP_T, CUBLAS_OP_N, n, n, &host_one,
                                    inv_a_mats[iw].get().device_data(), ldinv, &host_zero,
                                    a_mats[iw].get().device_data(), lda, psiM_invM_ptrs_[iw], ldinv),
                       "cuBLAS::geam failed.");
    }
    pivots_.resize(n * nw);
    infos_.resize(nw);
    LU_diags_fp_.resize(n * nw);
    cudaErrorCheck(cudaMemcpyAsync(psiM_invM_ptrs_.device_data(), psiM_invM_ptrs_.data(),
                                   psiM_invM_ptrs_.size() * sizeof(VALUE_FP*), cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync psiM_invM_ptrs_ failed!");
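    // cuBLAS_LU takes the matrices in column-major layout and uses batched LU factorization to
    // compute both the log determinants and the inverses; apart from the host-side infos buffer,
    // the pointers handed to it are device addresses.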
    cuBLAS_LU::computeInverseAndDetLog_batched(h_cublas_, h_stream, n, ldinv, psiM_invM_ptrs_.device_data(),
                                               psiM_invM_ptrs_.device_data() + nw, LU_diags_fp_.device_data(),
                                               pivots_.device_data(), infos_.data(), infos_.device_data(),
                                               log_values.device_data(), nw);
    for (int iw = 0; iw < nw; ++iw)
    {
      cudaErrorCheck(cudaMemcpyAsync(inv_a_mats[iw].get().data(), inv_a_mats[iw].get().device_data(),
                                     inv_a_mats[iw].get().size() * sizeof(VALUE_FP), cudaMemcpyDeviceToHost, h_stream),
                     "cudaMemcpyAsync failed copying DiracMatrixBatch::inv_psiM to host");
    }
    cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyDeviceToHost, h_stream),
                   "cudaMemcpyAsync log_values failed!");
    cudaErrorCheck(cudaStreamSynchronize(h_stream), "cudaStreamSynchronize failed!");
  }

  /** Calculates the actual inverse and log determinant on the accelerator with psiMs and invMs widened to full precision
   * and copied into contiguous vectors.
   *
   * \param[in] queue the compute queue; the CUDA stream used for transfers and kernels is retrieved from it.
   * \param[in,out] psi_Ms matrices flattened into a single pinned vector, returned holding the LU matrices.
   * \param[out] inv_Ms matrices flattened into a single pinned vector.
   * \param[in] n matrix rank.
   * \param[in] lda leading dimension of each matrix
   * \param[out] log_values log determinant value for each matrix, batch_size = log_values.size()
   *
   * List of operations:
   * 1. batched. Transfer psi_Ms to the device.
   * 2. batched. LU-factorize and invert.
   * 3. batched. Transfer inv_Ms to the host.
   * \todo Remove 1 and 3. Handle the transfers at the upper level.
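   *
   * Strided layout (illustrative): walker iw's psiM occupies psi_Ms[iw * n * lda, (iw + 1) * n * lda)
   * and its inverse the same range of inv_Ms; psiM_invM_ptrs_ packs the corresponding device
   * pointers as { psiM_dev[0..nw), invM_dev[0..nw) }.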
   */
  inline void mw_computeInvertAndLog_stride(compute::Queue<PlatformKind::CUDA>& queue,
                                            DualVector<VALUE_FP>& psi_Ms,
                                            DualVector<VALUE_FP>& inv_Ms,
                                            const int n,
                                            const int lda,
                                            DualVector<LogValue>& log_values)
  {
    // Deriving the batch size from log_values is probably dodgy.
    const int nw = log_values.size();
    psiM_invM_ptrs_.resize(nw * 2);
    for (int iw = 0; iw < nw; ++iw)
    {
      psiM_invM_ptrs_[iw]      = psi_Ms.device_data() + iw * n * lda;
      psiM_invM_ptrs_[iw + nw] = inv_Ms.device_data() + iw * n * lda;
    }
    pivots_.resize(n * nw);
    infos_.resize(nw);
    LU_diags_fp_.resize(n * nw);

    cudaStream_t h_stream = queue.getNative();
    cudaErrorCheck(cudaMemcpyAsync(psi_Ms.device_data(), psi_Ms.data(), psi_Ms.size() * sizeof(VALUE_FP),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM_fp to device");
    cudaErrorCheck(cudaMemcpyAsync(psiM_invM_ptrs_.device_data(), psiM_invM_ptrs_.data(),
                                   psiM_invM_ptrs_.size() * sizeof(VALUE_FP*), cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync psiM_invM_ptrs_ failed!");
    cuBLAS_LU::computeInverseAndDetLog_batched(h_cublas_, h_stream, n, lda, psiM_invM_ptrs_.device_data(),
                                               psiM_invM_ptrs_.device_data() + nw, LU_diags_fp_.device_data(),
                                               pivots_.device_data(), infos_.data(), infos_.device_data(),
                                               log_values.device_data(), nw);
#if NDEBUG
    // This is very useful for checking whether the data is already wrong on the device after all the
    // kernels and cuBLAS calls have run, or whether it only gets corrupted by the copy back.
    // cuBLAS_LU::peekinvM_batched(h_stream, psiM_mw_ptr, invM_mw_ptr, pivots_.device_data(), infos_.device_data(),
    //                             log_values.device_data(), nw);
#endif
    cudaErrorCheck(cudaMemcpyAsync(inv_Ms.data(), inv_Ms.device_data(), inv_Ms.size() * sizeof(VALUE_FP),
                                   cudaMemcpyDeviceToHost, h_stream),
                   "cudaMemcpyAsync failed copying back DiracMatrixBatch::invM_fp from device");
    cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyDeviceToHost, h_stream),
                   "cudaMemcpyAsync log_values failed!");
    cudaErrorCheck(cudaStreamSynchronize(h_stream), "cudaStreamSynchronize failed!");
  }

public:
  DiracMatrixComputeCUDA() : Resource("DiracMatrixComputeCUDA")
  {
    cublasErrorCheck(cublasCreate(&h_cublas_), "cublasCreate failed!");
  }

  DiracMatrixComputeCUDA(const DiracMatrixComputeCUDA& other) : Resource(other)
  {
    cublasErrorCheck(cublasCreate(&h_cublas_), "cublasCreate failed!");
  }

  ~DiracMatrixComputeCUDA() { cublasErrorCheck(cublasDestroy(h_cublas_), "cublasDestroy failed!"); }

  std::unique_ptr<Resource> makeClone() const override { return std::make_unique<DiracMatrixComputeCUDA>(*this); }

  /** Given a_mat, returns the inverted matrix and the log determinant of a_mat.
   * \param[in] a_mat a matrix input
   * \param[out] inv_a_mat inverted matrix
   * \param[out] log_values the log determinant is returned in log_values[0]
   *
   * I consider this single-matrix call to be semi-deprecated, so the log determinant values
   * vector is used to match the primary batched interface to the accelerated routines.
   * There is no optimization (yet) for TMAT being the same type as VALUE_FP.
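   *
   * Example sketch (illustrative; `engine`, `queue`, and the dual matrices `A` and `Ainv` are assumed
   * to exist in the calling scope, and the class's own alias names are used for brevity):
   * \code{.cpp}
   *   DualVector<LogValue> log_values(1);
   *   engine.invert_transpose(queue, A, Ainv, log_values);
   *   // Synchronous: log_values[0] now holds the log determinant of A.
   * \endcode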
   */
  template<typename TMAT>
  inline void invert_transpose(compute::Queue<PlatformKind::CUDA>& queue,
                               DualMatrix<TMAT>& a_mat,
                               DualMatrix<TMAT>& inv_a_mat,
                               DualVector<LogValue>& log_values)
  {
    cudaStream_t h_stream = queue.getNative();
    cublasErrorCheck(cublasSetStream(h_cublas_, h_stream), "cublasSetStream failed!");
    const int n   = a_mat.rows();
    const int lda = a_mat.cols();
    psiM_fp_.resize(n * lda);
    invM_fp_.resize(n * lda);
    std::fill(log_values.begin(), log_values.end(), LogValue{0.0, 0.0});
    // making sure we know the log_values are zeroed on the device.
    cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
    simd::transpose(a_mat.data(), n, lda, psiM_fp_.data(), n, lda);
    cudaErrorCheck(cudaMemcpyAsync(psiM_fp_.device_data(), psiM_fp_.data(), psiM_fp_.size() * sizeof(VALUE_FP),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::psiM_fp to device");
    mw_computeInvertAndLog_stride(queue, psiM_fp_, invM_fp_, n, lda, log_values);
    DualMatrix<VALUE_FP> data_ref_matrix;

    data_ref_matrix.attachReference(invM_fp_.data(), n, n);

    // We can't use operator= here because lda and ldb can differ, so we use this assignment,
    // which covers the smaller of the two matrices' dimensions.
    inv_a_mat.assignUpperLeft(data_ref_matrix);
    cudaErrorCheck(cudaMemcpyAsync(inv_a_mat.device_data(), inv_a_mat.data(), inv_a_mat.size() * sizeof(TMAT),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync of inv_a_mat to device failed!");
  }

  /** Mixed precision specialization
   * When TMAT is not full precision we still need to do the inversion and log determinant
   * at full precision. This is not yet optimized to transpose on the GPU.
   *
   * List of operations:
   * 1. matrix-by-matrix. Transpose a_mat into psiM_fp_ on the host.
   * 2. batched. Call mw_computeInvertAndLog_stride: H2D, invert, D2H.
   * 3. matrix-by-matrix. Copy invM_fp_ to inv_a_mat on the host. Transfer inv_a_mat to the device.
   *
   * Pros and cons:
   * 1. the transfer is batched, but the transfer size doubles due to the precision promotion.
   * 3. \todo Copying invM_fp_ to inv_a_mat on the device is desired. The transfer of inv_a_mat to the host should be handled by the upper-level code.
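   *
   * Dispatch sketch (illustrative): with VALUE_FP = double, calling mw_invertTranspose with
   * TMAT = float matrices selects this overload through the enable_if, while TMAT = double
   * matrices select the full-precision overload below.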
   */
  template<typename TMAT>
  inline std::enable_if_t<!std::is_same<VALUE_FP, TMAT>::value> mw_invertTranspose(
      compute::Queue<PlatformKind::CUDA>& queue,
      const RefVector<const DualMatrix<TMAT>>& a_mats,
      const RefVector<DualMatrix<TMAT>>& inv_a_mats,
      DualVector<LogValue>& log_values)
  {
    cudaStream_t h_stream = queue.getNative();
    cublasErrorCheck(cublasSetStream(h_cublas_, h_stream), "cublasSetStream failed!");
    assert(log_values.size() == a_mats.size());
    const int nw  = a_mats.size();
    const int n   = a_mats[0].get().rows();
    const int lda = a_mats[0].get().cols();
    size_t nsqr   = n * n;
    psiM_fp_.resize(n * lda * nw);
    invM_fp_.resize(n * lda * nw);
    std::fill(log_values.begin(), log_values.end(), LogValue{0.0, 0.0});
    // making sure we know the log_values are zeroed on the device.
    cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(), log_values.size() * sizeof(LogValue),
                                   cudaMemcpyHostToDevice, h_stream),
                   "cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
    for (int iw = 0; iw < nw; ++iw)
      simd::transpose(a_mats[iw].get().data(), n, a_mats[iw].get().cols(), psiM_fp_.data() + nsqr * iw, n, lda);
    mw_computeInvertAndLog_stride(queue, psiM_fp_, invM_fp_, n, lda, log_values);
    for (int iw = 0; iw < a_mats.size(); ++iw)
    {
      DualMatrix<VALUE_FP> data_ref_matrix;
      data_ref_matrix.attachReference(invM_fp_.data() + nsqr * iw, n, lda);
      // We can't use operator= here because lda and ldb can differ, so we use this assignment,
      // which covers the smaller of the two matrices' dimensions.
      inv_a_mats[iw].get().assignUpperLeft(data_ref_matrix);
      cudaErrorCheck(cudaMemcpyAsync(inv_a_mats[iw].get().device_data(), inv_a_mats[iw].get().data(),
                                     inv_a_mats[iw].get().size() * sizeof(TMAT), cudaMemcpyHostToDevice, h_stream),
                     "cudaMemcpyAsync of inv_a_mat to device failed!");
    }
  }

  /** Batched inversion and calculation of log determinants.
   * When TMAT is full precision we can use a_mat and inv_a_mat directly.
   * A side effect of this call is that the device copy of a_mats ends up holding
   * the LU factorization matrix.
   */
  template<typename TMAT>
  inline std::enable_if_t<std::is_same<VALUE_FP, TMAT>::value> mw_invertTranspose(
      compute::Queue<PlatformKind::CUDA>& queue,
      const RefVector<const DualMatrix<TMAT>>& a_mats,
      const RefVector<DualMatrix<TMAT>>& inv_a_mats,
      DualVector<LogValue>& log_values)
  {
    cudaStream_t h_stream = queue.getNative();
    cublasErrorCheck(cublasSetStream(h_cublas_, h_stream), "cublasSetStream failed!");
    assert(log_values.size() == a_mats.size());
    const int n = a_mats[0].get().rows();
    mw_computeInvertAndLog(queue, a_mats, inv_a_mats, n, log_values);
  }
};

} // namespace qmcplusplus

#endif //QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_CUDA_H