ETHmeasure.hpp
#include "Headers/mydebug.hpp"
// Assumed includes: Eigen and the standard headers used below. The project headers that
// define SubSpace, ManyBodyOperatorSpaceBase, SparseCompressed, ObjectOnGPU, and
// MicroCanonicalAverage (HilbertSpace.hpp, OperatorSpace.hpp, MatrixUtils.cuh,
// ObjectOnGPU.cuh, MicroCanonicalAverage.hpp) are assumed to be included as well.
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

#if __has_include(<omp.h>)
    #include <omp.h>
#endif

#ifdef GPU
    #include <cuda_runtime.h>  // assumption: the CUDA runtime API calls below need it

__host__ __device__ static inline int round_down(int x, int base) { return (x / base) * base; }
__host__ __device__ static inline int round_up(int x, int base) {
    return (x % base == 0 ? x : (x / base + 1) * base);
}
__host__ __device__ static inline int round_nearest(int x, int base) {
    return (x % base < base / 2 ? (x / base) * base : (x / base + 1) * base);
}
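// Worked examples (arguments are non-negative here): round_down(70, 32) == 64,
// round_up(10, 32) == 32, round_up(64, 32) == 64, round_nearest(47, 32) == 32,
// round_nearest(48, 32) == 64 (ties round up). Negative x is not handled.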
__device__ unsigned dynamic_smem_size() {
    unsigned ret;
    asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
    return ret;
}
__device__ unsigned get_smid(void) {
    unsigned ret;
    asm("mov.u32 %0, %smid;" : "=r"(ret));
    return ret;
}
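// Both helpers read PTX special registers: %dynamic_smem_size reports the dynamic
// shared-memory size the kernel was launched with, and %smid the id of the streaming
// multiprocessor (SM) executing the current block, assumed here to lie in
// [0, deviceProp.multiProcessorCount). ETHmeasure_kernel uses get_smid() to give each
// SM its own output column of res, so blocks on different SMs never hit the same
// atomicAdd target.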
template<class Derived1, class Derived2, class Derived3, class Derived4, class Derived5,
         class TotalSpace, typename Scalar>
__global__ void ETHmeasure_kernel(
    Eigen::DenseBase<Derived1> const* __restrict__ resPtr,
    Eigen::DenseBase<Derived2> const* __restrict__ dEigValPtr,
    typename SubSpace<TotalSpace, Scalar>::Real dE,
    Eigen::DenseBase<Derived3> const* __restrict__ dEigVecPtr,
    SubSpace<TotalSpace, Scalar> const* __restrict__ subSpacePtr,
    SparseCompressed<Scalar> const* __restrict__ adjointBasisPtr,
    ManyBodyOperatorSpaceBase<Derived4> const* __restrict__ dmBodyOpSpacePtr, int const transEqDim,
    int const* __restrict__ transEqClassRep, int const* __restrict__ transPeriod,
    Eigen::DenseBase<Derived5>* __restrict__ dWorkPtr) {
    using Real = typename SubSpace<TotalSpace, Scalar>::Real;

    Eigen::DenseBase<Derived1>& res = *const_cast< Eigen::DenseBase<Derived1>* >(resPtr);
    Eigen::DenseBase<Derived2> const& dEigVal = *dEigValPtr;
    Eigen::DenseBase<Derived3> const& dEigVec = *dEigVecPtr;
    ManyBodyOperatorSpaceBase<Derived4> const& dmBodyOpSpace = *dmBodyOpSpacePtr;
    SparseCompressed<Scalar> const& adjointBasis = (*adjointBasisPtr);
    int const dimSpace = subSpacePtr->dim();

    int const opOrdinal = blockIdx.x + gridDim.x * blockIdx.y;
    if(opOrdinal >= transEqDim) return;
    #ifndef NDEBUG
    if(opOrdinal == 0 && threadIdx.x == 0)
        printf("%s\n\t transEqDim=%d, dimSpace=%d, dWorkPtr->rows()=%d\n", __PRETTY_FUNCTION__,
               int(transEqDim), int(dimSpace), int(dWorkPtr->rows()));
    #endif
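
    // Dynamic shared-memory layout, as implied by the pointer arithmetic below:
    //   [ expVal : dimSpace Reals | tail region, reused twice ]
    // During the accumulation loop the tail holds per-thread scratch (outState:
    // blockDim.x ints, then coeff: blockDim.x Scalars); afterwards it is overwritten
    // with nEigVals staggered copies of the eigenvalue vector so that neighboring
    // threads read distinct copies (the bank-conflict hack referenced in the timing
    // notes at the end of this file). The host therefore sizes the launch with the
    // max() of the two tail footprints.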

    extern __shared__ int sharedMem[];
    Real* expValPtr = reinterpret_cast<Real*>(sharedMem);
    Eigen::Map< Eigen::VectorX<Real> > expVal(expValPtr, dimSpace);
    Real* eigValPtr = expValPtr + expVal.size();
    int configStorage[14];  // per-thread buffer; assumes dmBodyOpSpace.sysSize() <= 14
    Eigen::Map< Eigen::VectorX<int> > config(&configStorage[0], dmBodyOpSpace.sysSize());

    for(int j = threadIdx.x; j < dimSpace; j += blockDim.x) { expVal[j] = 0; }
    __syncthreads();

    int* outStatePtr = reinterpret_cast<int*>(expValPtr + expVal.size());
    Scalar* coeffPtr = reinterpret_cast<Scalar*>(outStatePtr + blockDim.x);
    for(int j = 0; j != round_up(subSpacePtr->totalSpace().dim(), blockDim.x) / blockDim.x; ++j) {
        int const inState = threadIdx.x + j * blockDim.x;
        if(inState < subSpacePtr->totalSpace().dim()) {
            dmBodyOpSpace.action(outStatePtr[threadIdx.x], coeffPtr[threadIdx.x],
                                 transEqClassRep[opOrdinal], inState, config);
            coeffPtr[threadIdx.x] = adjointBasis.valuePtr()[outStatePtr[threadIdx.x]]
                                    * coeffPtr[threadIdx.x]
                                    * conj(adjointBasis.valuePtr()[inState]);
        }
        __syncthreads();

        for(int k = 0; k != blockDim.x; ++k) {
            int const inState = k + j * blockDim.x;
            if(inState >= subSpacePtr->totalSpace().dim()) continue;
            int const id1 = adjointBasis.innerIndexPtr()[inState];
            int const id2 = adjointBasis.innerIndexPtr()[outStatePtr[k]];
            Scalar const coeff = coeffPtr[k];
            for(int expvalId = threadIdx.x; expvalId < dimSpace; expvalId += blockDim.x) {
                atomicAdd(&expVal[expvalId],
                          real(conj(dEigVec(id2, expvalId)) * coeff * dEigVec(id1, expvalId)));
            }
        }
        __syncthreads();
    }

    int const nEigVals
        = (dynamic_smem_size() - sizeof(Real) * dimSpace) / (sizeof(Real) * dimSpace);
    for(int j = threadIdx.x; j < nEigVals * dimSpace; j += blockDim.x) {
        *(eigValPtr + j) = dEigVal(j % dimSpace);
    }
    __syncthreads();
    // Calculate squared differences from the microcanonical average.
    int const k = threadIdx.x % nEigVals;  // stagger threads across eigenvalue copies
    Eigen::Map< Eigen::VectorX<Real> > eigVal(eigValPtr + k * dimSpace, dimSpace);
    Real const opTransPeriod = transPeriod[opOrdinal];
    for(int j = threadIdx.x; j < dimSpace; j += blockDim.x) {
        int idMin = j, idMax = j;
        for(idMin = j; idMin >= 0 && eigVal(j) - eigVal(idMin) <= dE; --idMin) {};
        ++idMin;
        for(idMax = j; idMax < dimSpace && eigVal(idMax) - eigVal(j) <= dE; ++idMax) {};
        --idMax;
        assert(idMin >= 0);
        assert(idMax < dimSpace);
        assert(idMin <= idMax);

        Real MCave = 0;
        for(int id = idMin; id != idMax + 1; ++id) MCave += expVal[id];
        MCave /= static_cast<Real>(idMax - idMin + 1);
        Real const diff = opTransPeriod * (expVal[j] - MCave) * (expVal[j] - MCave);
        atomicAdd(&res(j, get_smid()), diff);
    }
}
#endif

#pragma omp declare reduction(+ : Eigen::VectorXd : omp_out=omp_out+omp_in) initializer(omp_priv = omp_orig)

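// The declared reduction teaches OpenMP to combine Eigen::VectorXd accumulators across
// threads. A minimal sketch of how it is meant to be used (hypothetical names;
// contribution() is a placeholder):
//   Eigen::VectorXd acc = Eigen::VectorXd::Zero(n);
//   #pragma omp parallel for reduction(+ : acc)
//   for(int i = 0; i < m; ++i) acc += contribution(i);
// Each thread's private copy is initialized from omp_orig (so it inherits the correct
// size), and the copies are merged with omp_out = omp_out + omp_in.
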
template<class SubSpace_>
class FuncETHmeasure;

template<class TotalSpace_, typename Scalar_>
class FuncETHmeasure< SubSpace<TotalSpace_, Scalar_> > {
    private:
    using TotalSpace = TotalSpace_;
    using Scalar = Scalar_;
    using Real = typename SubSpace<TotalSpace_, Scalar_>::Real;
    using Vector = Eigen::VectorXd;
    std::vector<Vector> m_expVal;
    std::vector<Vector> m_mcAverage;

    public:
    FuncETHmeasure() : m_expVal(omp_get_max_threads()), m_mcAverage(omp_get_max_threads()) {
        debug_constructor_printf(1);
    }

    template<class Derived>
    void operator()(Eigen::VectorXd& res, Eigen::MatrixBase<Derived> const& eigVector,
                    SubSpace<TotalSpace, Scalar> const& subSpace,
                    mBodyOperatorSpace<Scalar> const& mBodyOpSpace,
                    MicroCanonicalAverage const& MCaverage);

#ifdef GPU
    public:
    template<typename Matrix_t, class Derived>
    void operator()(Eigen::VectorXd& res, ObjectOnGPU<Matrix_t> const& dEigVector,
                    SubSpace<TotalSpace, Scalar> const& subSpace,
                    ManyBodyOperatorSpaceBase<Derived> const& mBodyOpSpace,
                    MicroCanonicalAverage const& MCaverage);
#endif
};

template<class TotalSpace_, typename Scalar_>
template<class Derived>
void FuncETHmeasure< SubSpace<TotalSpace_, Scalar_> >::operator()(
    Eigen::VectorXd& res, Eigen::MatrixBase<Derived> const& eigVector,
    SubSpace<TotalSpace, Scalar> const& subSpace, mBodyOperatorSpace<Scalar> const& mBodyOpSpace,
    MicroCanonicalAverage const& MCaverage) {
    debug_print("FuncETHmeasure " << __func__
                << ": eigVector is NOT on GPU. Using CPU algorithm...");

    mBodyOpSpace.computeTransEqClass();
    std::cout << "FuncETHmeasure():\tm = " << mBodyOpSpace.m()
              << ", \tmBodyOpSpace.dim() = " << mBodyOpSpace.dim()
              << ", \tmBodyOpSpace.transEqDim() = " << mBodyOpSpace.transEqDim()
              << ", \tsubSpace.dim() = " << subSpace.dim() << std::endl;
    res = Eigen::VectorXd::Zero(subSpace.dim());
    if(res.norm() > 1.0e-4) {
        std::cerr << "Error(" << __func__
                  << ") : failed to initialize res: res.norm() = " << res.norm() << " is too large."
                  << std::endl;
        std::exit(EXIT_FAILURE);
    }
    std::for_each(m_expVal.begin(), m_expVal.end(),
                  [&eigVector](auto& x) { x.resize(eigVector.cols()); });

    debug_print(eigVector);
    debug_print(subSpace.basis());
    debug_print(mBodyOpSpace.basisOp(0));

    omp_set_max_active_levels(1);
// #pragma omp parallel for reduction(+ : res)
    for(int opEqClass = 0; opEqClass < mBodyOpSpace.transEqDim(); ++opEqClass) {
        int opNum = mBodyOpSpace.transEqClassRep(opEqClass);
        int thread = omp_get_thread_num();

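        // With B = subSpace.basis() and O = mBodyOpSpace.basisOp(opNum), the statement
        // below evaluates expVal(a) = Re( <E_a| B^+ O B |E_a> ) for every eigenvector
        // |E_a> (column a of eigVector), i.e. the diagonal matrix elements of the
        // operator in the energy eigenbasis, restricted to the subspace.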
        m_expVal[thread]
            = (eigVector.adjoint()
               * (subSpace.basis().adjoint() * mBodyOpSpace.basisOp(opNum) * subSpace.basis())
                     .pruned()
                     .eval()
               * eigVector)
                  .diagonal()
                  .real();
        debug_print("\tBefore MCaverage: opNum=" << opNum << ", thread=" << thread);
        MCaverage(m_mcAverage[thread], m_expVal[thread]);

        res += mBodyOpSpace.transPeriod(opEqClass)
               * (m_expVal[thread] - m_mcAverage[thread]).cwiseAbs2();
    }
    debug_print("FuncETHmeasure " << __func__ << ": CPU algorithm finished.");
}
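
// Minimal usage sketch for the CPU path (hypothetical setup; the spaces, eigenvectors,
// and the MicroCanonicalAverage object must be prepared as elsewhere in StatMech):
//   FuncETHmeasure< SubSpace<TotalSpace, Scalar> > measure;
//   Eigen::VectorXd fluct;
//   measure(fluct, eigVectors, subSpace, mBodyOpSpace, MCaverage);
// Afterwards fluct(a) holds the period-weighted sum over operator classes of
// ( <E_a|O|E_a> - microcanonical average )^2.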

#ifdef GPU
template<class TotalSpace_, typename Scalar_>
template<typename Matrix_t, class Derived>
void FuncETHmeasure< SubSpace<TotalSpace_, Scalar_> >::operator()(
    Eigen::VectorXd& res, ObjectOnGPU<Matrix_t> const& dEigVector,
    SubSpace<TotalSpace, Scalar> const& subSpace,
    ManyBodyOperatorSpaceBase<Derived> const& mBodyOpSpace,
    MicroCanonicalAverage const& MCaverage) {

    // dEigVector should be stored in row-major.

    debug_print("FuncETHmeasure " << __func__
                << ": dEigVector is on GPU. Using GPU algorithm...");
    int nGPUs;
    cuCHECK(cudaGetDeviceCount(&nGPUs));
    mBodyOpSpace.computeTransEqClass();
    std::cout << "FuncETHmeasure(): nGPUs = " << nGPUs
              << ", \tmBodyOpSpace.dim() = " << mBodyOpSpace.dim()
              << ", \tmBodyOpSpace.transEqDim() = " << mBodyOpSpace.transEqDim()
              << ", \tsubSpace.dim() = " << subSpace.dim() << std::endl;
    res = Eigen::VectorXd::Zero(subSpace.dim());
    if(res.norm() > 1.0e-4) {
        std::cerr << "Error(" << __func__
                  << ") : failed to initialize res: res.norm() = " << res.norm() << " is too large."
                  << std::endl;
        std::exit(EXIT_FAILURE);
    }
    size_t const expValMemSize = sizeof(Real) * subSpace.dim();
    size_t const eigValMemSize = sizeof(Real) * subSpace.dim();  // per copy of the eigenvalue vector
    size_t const requiredSmSize = expValMemSize + eigValMemSize;

    // GPU-side preparation
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);

    ObjectOnGPU< Eigen::MatrixX<Real> > dRes(
        Eigen::MatrixX<Real>::Zero(subSpace.dim(), deviceProp.multiProcessorCount).eval());
    ObjectOnGPU< SubSpace<TotalSpace, Scalar> > dSubSpace(subSpace);
    ObjectOnGPU< SparseCompressed<Scalar> > dAdjointBasis(subSpace.basis().adjoint());
    ObjectOnGPU< Eigen::VectorX<Real> > dEigVal(MCaverage.eigVal());
    // dWork: device-side scratch passed to the kernel (assumed shape; the kernel only
    // reads its rows() in debug output).
    ObjectOnGPU< Eigen::VectorX<Real> > dWork(Eigen::VectorX<Real>::Zero(subSpace.dim()).eval());
    ObjectOnGPU<Derived> dmBodyOpSpace(static_cast<Derived const&>(mBodyOpSpace));
    int* transEqClassRep = nullptr;
    int* transPeriod = nullptr;
    cuCHECK(cudaMalloc(&transEqClassRep, mBodyOpSpace.transEqDim() * sizeof(int)));
    cuCHECK(cudaMalloc(&transPeriod, mBodyOpSpace.transEqDim() * sizeof(int)));
    cuCHECK(cudaMemcpyAsync(transEqClassRep, mBodyOpSpace.transEqClassRep().data(),
                            mBodyOpSpace.transEqDim() * sizeof(int), cudaMemcpyHostToDevice));
    cuCHECK(cudaMemcpyAsync(transPeriod, mBodyOpSpace.transPeriod().data(),
                            mBodyOpSpace.transEqDim() * sizeof(int), cudaMemcpyHostToDevice));

    void (*m_kernel)(
        Eigen::DenseBase< std::remove_reference_t<decltype(*dRes.ptr())> > const*,
        Eigen::DenseBase< std::remove_reference_t<decltype(*dEigVal.ptr())> > const*, Real,
        Eigen::DenseBase< std::remove_reference_t<decltype(*dEigVector.ptr())> > const*,
        SubSpace<TotalSpace, Scalar> const*, SparseCompressed<Scalar> const*,
        ManyBodyOperatorSpaceBase<Derived> const*, int const, int const*, int const*,
        Eigen::DenseBase< std::remove_reference_t<decltype(*dWork.ptr())> >*)
        = ETHmeasure_kernel;

    // Determine the configuration of shared memory.
    int shared_memory_size = deviceProp.sharedMemPerMultiprocessor - 1024;
    int nEigVals = (shared_memory_size - expValMemSize) / eigValMemSize;
    int smSize = expValMemSize + nEigVals * eigValMemSize;

    cuCHECK(cudaFuncSetAttribute(m_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smSize));
    struct cudaFuncAttributes m_attr;
    cuCHECK(cudaFuncGetAttributes(&m_attr, m_kernel));
    shared_memory_size = m_attr.maxDynamicSharedSizeBytes;

    int constexpr warpSize = 32;
    int const nThread = std::min(round_up(subSpace.dim(), warpSize), m_attr.maxThreadsPerBlock);
    int const nBlock = static_cast<int>(std::sqrt(static_cast<double>(mBodyOpSpace.transEqDim()))) + 1;
    nEigVals = 2;  // hard-coded override of the value computed above

    // The tail of shared memory is reused (eigenvalue copies vs. per-thread scratch),
    // so reserve the larger of the two footprints.
    smSize = expValMemSize
             + std::max(nEigVals * eigValMemSize, (sizeof(int) + sizeof(Scalar)) * nThread);

    std::cout << "\tnThread = " << nThread << ", nBlock = " << nBlock
              << ", m_attr.maxThreadsPerBlock = " << m_attr.maxThreadsPerBlock
              << ", requiredSmSize = " << requiredSmSize << ", smSize = " << smSize
              << ", shared_memory_size = " << shared_memory_size << ", nEigVals = " << nEigVals
              << ", deviceProp.sharedMemPerMultiprocessor = "
              << deviceProp.sharedMemPerMultiprocessor << std::endl;
    assert(nThread >= 1);
    assert(nBlock >= 1);
    assert(smSize <= shared_memory_size);

    m_kernel<<<dim3(nBlock, nBlock), dim3(nThread, 1), smSize>>>(
        dRes.ptr(), dEigVal.ptr(), static_cast<Real>(MCaverage.shellWidth()), dEigVector.ptr(),
        dSubSpace.ptr(), dAdjointBasis.ptr(), dmBodyOpSpace.ptr(), mBodyOpSpace.transEqDim(),
        transEqClassRep, transPeriod, dWork.ptr());
    cuCHECK(cudaGetLastError());
    cuCHECK(cudaFree(transEqClassRep));
    cuCHECK(cudaFree(transPeriod));

    cuCHECK(cudaDeviceSynchronize());

    res = dRes.get().template cast<double>().rowwise().sum();
}
#endif  // #ifdef GPU

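// Design note: each block accumulates into the dRes column of the SM it runs on
// (atomicAdd(&res(j, get_smid()), diff)), so atomics only contend between blocks that
// share an SM; the final rowwise().sum() above folds the per-SM partials into res.
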
// Timing notes for L=11 on AI-g11:
//   bank-conflict hack | SM-wise summation | index order       | time [sec]
//   without            | without           | (SMid,opOrdinal)  | 80.8295
//   with               | without           | (SMid,opOrdinal)  | 84.4658
//   with               | with              | (SMid,opOrdinal)  | 84.0022
//   without            | with              | (SMid,opOrdinal)  | 81.1254
//   without            | with              | (opOrdinal,SMid)  | 81.4284
//   with               | with              | (opOrdinal,SMid)  | 84.9689