225 {
226
227
228
230 << ": dEigVector is on GPU. (Algorithm is NOT implemented)");
231 int nGPUs;
232 cuCHECK(cudaGetDeviceCount(&nGPUs));
234 std::cout << "FuncETHmeasure(): nGPUs = " << nGPUs
235 <<
", \tmBodyOpSpace.dim() = " << mBodyOpSpace.
dim()
236 <<
", \tmBodyOpSpace.transEqDim() = " << mBodyOpSpace.
transEqDim()
237 <<
", \tsubSpace.dim() = " << subSpace.
dim() << std::endl;
238 res = Eigen::VectorXd::Zero(subSpace.
dim());
239 if(res.norm() > 1.0e-4) {
240 std::cerr << "Error(" << __func__
241 << ") : failed to initialize res: res.norm() = " << res.norm() << " is too large."
242 << std::endl;
243 std::exit(EXIT_FAILURE);
244 }
245 size_t const expValMemSize =
sizeof(
Real) * subSpace.
dim();
246 size_t const eigValMemSize =
sizeof(
Real) * subSpace.
dim();
247 size_t const requiredSmSize = expValMemSize + eigValMemSize;
248
249
250 cudaDeviceProp deviceProp;
251 cudaGetDeviceProperties(&deviceProp, 0);
252
254 Eigen::MatrixX<Real>::Zero(subSpace.
dim(), deviceProp.multiProcessorCount).eval());
260 int* transEqClassRep = nullptr;
261 int* transPeriod = nullptr;
265 mBodyOpSpace.
transEqDim() *
sizeof(
int), cudaMemcpyHostToDevice));
267 mBodyOpSpace.
transEqDim() *
sizeof(
int), cudaMemcpyHostToDevice));
268
269 void (*m_kernel)(
270 Eigen::DenseBase< std::remove_reference_t<decltype(*dRes.ptr())> > const*,
271 Eigen::DenseBase< std::remove_reference_t<
decltype(*dEigVal.ptr())> >
const*,
Real,
272 Eigen::DenseBase< std::remove_reference_t<
decltype(*dEigVector.
ptr())> >
const*,
275 Eigen::DenseBase< std::remove_reference_t<
decltype(*dWork.
ptr())> >*)
277
278
279 int shared_memory_size = deviceProp.sharedMemPerMultiprocessor - 1024;
280 int nEigVals = (shared_memory_size - expValMemSize) / eigValMemSize;
281 int smSize = expValMemSize + nEigVals * eigValMemSize;
282
283 cuCHECK(cudaFuncSetAttribute(m_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smSize));
284 struct cudaFuncAttributes m_attr;
285 cuCHECK(cudaFuncGetAttributes(&m_attr, m_kernel));
286 shared_memory_size = m_attr.maxDynamicSharedSizeBytes;
287
288 int constexpr warpSize = 32;
289 int const nThread = min(round_up(subSpace.
dim(), warpSize), m_attr.maxThreadsPerBlock);
290 int const nBlock =
static_cast<int>(sqrt(
static_cast<double>(mBodyOpSpace.
transEqDim()))) + 1;
291 nEigVals = 2;
292
293 smSize
294 = expValMemSize + max(nEigVals * eigValMemSize, (
sizeof(
int) +
sizeof(
Scalar)) *
nThread);
295
296 std::cout <<
"\tnThread = " <<
nThread <<
", nBlock = " <<
nBlock
297 << ", m_attr.maxThreadsPerBlock = " << m_attr.maxThreadsPerBlock
298 << ", requiredSmSize = " << requiredSmSize << ", smSize = " << smSize
299 << ", shared_memory_size = " << shared_memory_size << ", nEigVals = " << nEigVals
300 << ", deviceProp.sharedMemPerMultiprocessor = "
301 << deviceProp.sharedMemPerMultiprocessor << std::endl;
304 assert(smSize <= shared_memory_size);
305
307 dRes.ptr(), dEigVal.ptr(),
static_cast<Real>(MCaverage.shellWidth()), dEigVector.
ptr(),
308 dSubSpace.ptr(), dAdjointBasis.ptr(), dmBodyOpSpace.ptr(), mBodyOpSpace.
transEqDim(),
309 transEqClassRep, transPeriod, dWork.
ptr());
311 cuCHECK(cudaFree(transEqClassRep));
312 cuCHECK(cudaFree(transPeriod));
313
314 cuCHECK(cudaDeviceSynchronize());
315
316 res = dRes.get().template cast<double>().rowwise().sum();
317}
__global__ void ETHmeasure_kernel(Eigen::DenseBase< Derived1 > const *__restrict__ resPtr, Eigen::DenseBase< Derived2 > const *__restrict__ dEigValPtr, typename SubSpace< TotalSpace, Scalar >::Real dE, Eigen::DenseBase< Derived3 > const *__restrict__ dEigVecPtr, SubSpace< TotalSpace, Scalar > const *__restrict__ subSpacePtr, SparseCompressed< Scalar > const *__restrict__ adjointBasisPtr, ManyBodyOperatorSpaceBase< Derived4 > const *__restrict__ dmBodyOpSpacePtr, int const transEqDim, int const *__restrict__ transEqClassRep, int const *__restrict__ transPeriod, Eigen::DenseBase< Derived5 > *__restrict__ dWorkPtr)
Definition ETHmeasure.hpp:37
Definition mytypes.hpp:147
typename SubSpace< TotalSpace_, Scalar_ >::Real Real
Definition ETHmeasure.hpp:140
Definition OperatorSpace.hpp:213
Object_t * ptr() const
Definition ObjectOnGPU.cuh:144
Definition ObjectOnGPU.cuh:149
Definition HilbertSpace.hpp:568
cuCHECK(cudaFuncGetAttributes(&attr, MatrixElementsInSector))
Integer_t const nBlock
Definition getAttributesOfMatrixElementsInSector.cpp:5
Integer_t const nThread
Definition getAttributesOfMatrixElementsInSector.cpp:4