QMCPACK
DiracMatrixComputeOMPTarget.hpp
Go to the documentation of this file.
1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2021 QMCPACK developers.
6 //
7 // File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab
8 //
9 // File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 #ifndef QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_OMPTARGET_H
13 #define QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_OMPTARGET_H
14 
15 #include "CPU/Blasf.h"
16 #include "CPU/BlasThreadingEnv.h"
17 #include "OhmmsPETE/OhmmsMatrix.h"
21 #include "DiracMatrix.h"
24 #include "Concurrency/OpenMP.h"
25 #include "CPU/SIMD/algorithm.hpp"
26 #include "ResourceCollection.h"
27 
28 namespace qmcplusplus
29 {
30 /** class to compute matrix inversion and the log value of determinant
31  * of a batch of DiracMatrixes.
32  *
33  * @tparam VALUE_FP the datatype used in the actual computation of the matrix
34  *
35  * There is one per crowd not one per MatrixUpdateEngine.
36  * this puts ownership of the scratch resources in a sensible place.
37  *
38  * Currently this is CPU only but its external API is somewhat written to
39  * enforce the passing Dual data objects as arguments. Except for the single
40  * particle API log_value which is not Dual type but had better have an address in a OMPtarget
41  * mapped region if target is used with it. This makes this API incompatible to
42  * that used by MatrixDelayedUpdateCuda and DiracMatrixComputeCUDA.
43  */
44 template<typename VALUE_FP>
46 {
47 public:
49  using LogValue = std::complex<FullPrecReal>;
50 
51  // This class only works with OMPallocator so explicitly call OffloadAllocator what it
52  // is and not DUAL
53  template<typename T>
55  template<typename T>
57  template<typename T>
59 
60  // maybe you'll want a resource someday, then change here.
62 
63 private:
65  int lwork_;
66 
67  /// Matrices held in memory matrices n^2 * nw elements
72 
73  /** reset internal work space.
74  * My understanding might be off.
75  *
76  * it smells that this is so complex.
77  */
78  inline void reset(OffloadPinnedVector<VALUE_FP>& psi_Ms, const int n, const int lda, const int batch_size)
79  {
80  const int nw = batch_size;
81  pivots_.resize(lda * nw);
82  for (int iw = 0; iw < nw; ++iw)
83  {
84  lwork_ = -1;
85  VALUE_FP tmp;
86  FullPrecReal lw;
87  auto psi_M_ptr = psi_Ms.data() + iw * n * n;
88  Xgetri(lda, psi_M_ptr, lda, pivots_.data() + iw * n, &tmp, lwork_);
89  convert(tmp, lw);
90  lwork_ = static_cast<int>(lw);
91  m_work_.resize(lwork_);
92  }
93  }
94 
95  /** reset internal work space for single walker case
96  * My understanding might be off.
97  *
98  * it smells that this is so complex.
99  */
100  inline void reset(OffloadPinnedMatrix<VALUE_FP>& psi_M, const int n, const int lda)
101  {
102  pivots_.resize(lda);
103  LU_diags_fp_.resize(lda);
104  lwork_ = -1;
105  VALUE_FP tmp;
106  FullPrecReal lw;
107  Xgetri(lda, psi_M.data(), lda, pivots_.data(), &tmp, lwork_);
108  lw = std::real(tmp);
109  lwork_ = static_cast<int>(lw);
110  m_work_.resize(lwork_);
111  }
112 
113  /** compute the inverse of invMat (in place) and the log value of determinant
114  * \tparam TMAT value type of matrix
115  * \param[inout] a_mat the matrix
116  * \param[in] n actual dimension of square matrix (no guarantee it really has full column rank)
117  * \param[in] lda leading dimension of Matrix container
118  * \param[out] log_value log a_mat before inversion
119  */
120  template<typename TMAT>
121  inline void computeInvertAndLog(OffloadPinnedMatrix<TMAT>& a_mat, const int n, const int lda, LogValue& log_value)
122  {
124  if (lwork_ < lda)
125  reset(a_mat, n, lda);
126  Xgetrf(n, n, a_mat.data(), lda, pivots_.data());
127  for (int i = 0; i < n; i++)
128  LU_diags_fp_[i] = a_mat.data()[i * lda + i];
129  log_value = {0.0, 0.0};
130  computeLogDet(LU_diags_fp_.data(), n, pivots_.data(), log_value);
131  Xgetri(n, a_mat.data(), lda, pivots_.data(), m_work_.data(), lwork_);
132  }
133 
134  template<typename TMAT>
136  const int n,
137  const int lda,
139  {
140  const int nw = log_values.size();
142  if (lwork_ < lda)
143  reset(psi_Ms, n, lda, nw);
144  pivots_.resize(n * nw);
145  LU_diags_fp_.resize(n * nw);
146  for (int iw = 0; iw < nw; ++iw)
147  {
148  VALUE_FP* LU_M = psi_Ms.data() + iw * n * n;
149  Xgetrf(n, n, LU_M, lda, pivots_.data() + iw * n);
150  for (int i = 0; i < n; i++)
151  *(LU_diags_fp_.data() + iw * n + i) = LU_M[i * lda + i];
152  LogValue log_value{0.0, 0.0};
153  computeLogDet(LU_diags_fp_.data() + iw * n, n, pivots_.data() + iw * n, log_value);
154  log_values[iw] = log_value;
155  Xgetri(n, LU_M, lda, pivots_.data() + iw * n, m_work_.data(), lwork_);
156  }
157  }
158 
159  /// matrix inversion engine
161 
162 public:
163  DiracMatrixComputeOMPTarget() : Resource("DiracMatrixComputeOMPTarget"), lwork_(0) {}
164 
165  std::unique_ptr<Resource> makeClone() const override { return std::make_unique<DiracMatrixComputeOMPTarget>(*this); }
166 
167  /** compute the inverse of the transpose of matrix A and its determinant value in log
168  * when VALUE_FP and TMAT are the same
169  * @tparam TMAT matrix value type
170  * @tparam TREAL real type
171  * \param [in] resource compute resource
172  * \param [in] a_mat matrix to be inverted
173  * \param [out] inv_a_mat the inverted matrix
174  * \param [out] log_value breaks compatibility of MatrixUpdateOmpTarget with
175  * DiracMatrixComputeCUDA but is fine for OMPTarget
176  */
177  template<typename TMAT>
178  inline std::enable_if_t<std::is_same<VALUE_FP, TMAT>::value> invert_transpose(HandleResource& resource,
179  const OffloadPinnedMatrix<TMAT>& a_mat,
180  OffloadPinnedMatrix<TMAT>& inv_a_mat,
181  LogValue& log_value)
182  {
183  const int n = a_mat.rows();
184  const int lda = a_mat.cols();
185  const int ldb = inv_a_mat.cols();
186  simd::transpose(a_mat.data(), n, lda, inv_a_mat.data(), n, ldb);
187  // In this case we just pass the value since
188  // that makes sense for a single walker API
189  computeInvertAndLog(inv_a_mat, n, ldb, log_value);
190  }
191 
192  /** compute the inverse of the transpose of matrix A and its determinant value in log
193  * when VALUE_FP and TMAT are the different
194  * @tparam TMAT matrix value type
195  * @tparam TREAL real type
196  */
197  template<typename TMAT>
198  inline std::enable_if_t<!std::is_same<VALUE_FP, TMAT>::value> invert_transpose(HandleResource& resource,
199  const OffloadPinnedMatrix<TMAT>& a_mat,
200  OffloadPinnedMatrix<TMAT>& inv_a_mat,
201  LogValue& log_value)
202  {
203  const int n = a_mat.rows();
204  const int lda = a_mat.cols();
205  const int ldb = inv_a_mat.cols();
206 
207  psiM_fp_.resize(n * lda);
208  simd::transpose(a_mat.data(), n, lda, psiM_fp_.data(), n, lda);
209  OffloadPinnedMatrix<VALUE_FP> psiM_fp_view(psiM_fp_, psiM_fp_.data(), n, lda);
210  computeInvertAndLog(psiM_fp_view, n, lda, log_value);
211 
212  //Matrix<TMAT> data_ref_matrix;
213  //maybe n, lda
214  //data_ref_matrix.attachReference(psiM_fp_.data(), n, n);
215  //Because inv_a_mat is "aligned" this is unlikely to work.
216  simd::remapCopy(n, n, psiM_fp_.data(), lda, inv_a_mat.data(), ldb);
217  }
218 
219  /** This covers both mixed and Full precision case.
220  *
221  * \todo measure if using the a_mats without a copy to contiguous vector is better.
222  */
223  template<typename TMAT, PlatformKind PL>
224  inline void mw_invertTranspose(compute::Queue<PL>& resource_ignored,
225  const RefVector<const OffloadPinnedMatrix<TMAT>>& a_mats,
226  const RefVector<OffloadPinnedMatrix<TMAT>>& inv_a_mats,
228  {
229  for (int iw = 0; iw < a_mats.size(); iw++)
230  {
231  auto& Ainv = inv_a_mats[iw].get();
232  detEng_.invert_transpose(a_mats[iw].get(), Ainv, log_values[iw]);
233  Ainv.updateTo();
234  }
235 
236  /* FIXME
237  const int nw = a_mats.size();
238  const size_t n = a_mats[0].get().rows();
239  const size_t lda = a_mats[0].get().cols();
240  const size_t ldb = inv_a_mats[0].get().cols();
241 
242  size_t nsqr{n * n};
243  psiM_fp_.resize(n * lda * nw);
244  for (int iw = 0; iw < nw; ++iw)
245  simd::transpose(a_mats[iw].get().data(), n, lda, psiM_fp_.data() + nsqr * iw, n, lda);
246 
247  computeInvertAndLog(psiM_fp_, n, lda, log_values);
248  for (int iw = 0; iw < nw; ++iw)
249  {
250  simd::remapCopy(n, n, psiM_fp_.data() + nsqr * iw, lda, inv_a_mats[iw].get().data(), ldb);
251  }
252  */
253  }
254 };
255 } // namespace qmcplusplus
256 
257 #endif // QMCPLUSPLUS_DIRAC_MATRIX_COMPUTE_OMPTARGET_H
std::unique_ptr< Resource > makeClone() const override
std::vector< T, aligned_allocator< T > > aligned_vector
QMCTraits::RealType real
helper functions for EinsplineSetBuilder
Definition: Configuration.h:43
class to compute matrix inversion and the log value of determinant of a batch of DiracMatrixes.
std::vector< StdComp, CUDAHostAllocator< StdComp > > log_values(batch_size)
void computeInvertAndLog(OffloadPinnedMatrix< TMAT > &a_mat, const int n, const int lda, LogValue &log_value)
compute the inverse of invMat (in place) and the log value of determinant
void convert(const PL &lat, const PV &pin, PV &pout)
void mw_invertTranspose(compute::Queue< PL > &resource_ignored, const RefVector< const OffloadPinnedMatrix< TMAT >> &a_mats, const RefVector< OffloadPinnedMatrix< TMAT >> &inv_a_mats, OffloadPinnedVector< LogValue > &log_values)
This covers both mixed and Full precision case.
service class for explicitly managing the threading of BLAS/LAPACK calls from OpenMP parallel region ...
void transpose(const T *restrict A, size_t m, size_t lda, TO *restrict B, size_t n, size_t ldb)
transpose of A(m,n) to B(n,m)
Definition: algorithm.hpp:97
int Xgetrf(int n, int m, float *restrict a, int lda, int *restrict piv)
wrappers around xgetrf lapack routines
Definition: DiracMatrix.h:31
OffloadPinnedVector< VALUE_FP > psiM_fp_
Matrices held in memory matrices n^2 * nw elements.
size_type cols() const
Definition: OhmmsMatrix.h:78
DiracMatrix< VALUE_FP > detEng_
matrix inversion engine
int Xgetri(int n, float *restrict a, int lda, int *restrict piv, float *restrict work, int &lwork)
inversion of a float matrix after lu factorization
Definition: DiracMatrix.h:60
void remapCopy(size_t m, size_t n, const T *restrict A, size_t lda, TO *restrict B, size_t ldb)
copy of A(m,n) to B(m,n)
Definition: algorithm.hpp:115
void computeLogDet(const T *restrict diag, int n, const int *restrict pivot, std::complex< T_FP > &logdet)
Definition: DiracMatrix.h:100
void reset(OffloadPinnedVector< VALUE_FP > &psi_Ms, const int n, const int lda, const int batch_size)
reset internal work space.
OMPallocator is an allocator with fused device and dualspace allocator functionality.
size_type rows() const
Definition: OhmmsMatrix.h:77
std::enable_if_t< std::is_same< VALUE_FP, TMAT >::value > invert_transpose(HandleResource &resource, const OffloadPinnedMatrix< TMAT > &a_mat, OffloadPinnedMatrix< TMAT > &inv_a_mat, LogValue &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when VALUE_FP and T...
std::vector< std::reference_wrapper< T > > RefVector
typename RealAlias_impl< T >::value_type RealAlias
If you have a function templated on a value that can be real or complex and you need to get the base ...
void reset(OffloadPinnedMatrix< VALUE_FP > &psi_M, const int n, const int lda)
reset internal work space for single walker case My understanding might be off.
void computeInvertAndLog(OffloadPinnedVector< TMAT > &psi_Ms, const int n, const int lda, OffloadPinnedVector< LogValue > &log_values)
std::enable_if_t<!std::is_same< VALUE_FP, TMAT >::value > invert_transpose(HandleResource &resource, const OffloadPinnedMatrix< TMAT > &a_mat, OffloadPinnedMatrix< TMAT > &inv_a_mat, LogValue &log_value)
compute the inverse of the transpose of matrix A and its determinant value in log when VALUE_FP and T...
SIMD version of functions in algorithm.
std::enable_if_t< std::is_same< T_FP, TMAT >::value > invert_transpose(const Matrix< TMAT, ALLOC1 > &amat, Matrix< TMAT, ALLOC2 > &invMat, std::complex< TREAL > &LogDet)
compute the inverse of the transpose of matrix A and its determinant value in log when T_FP and TMAT ...
Definition: DiracMatrix.h:188
int getNextLevelNumThreads()
get the number of threads at the next parallel level
Definition: OpenMP.h:35