From 8a16ff8e3150cd19bc7d0bac02303c3a90c59c97 Mon Sep 17 00:00:00 2001 From: Gilles Grospellier Date: Thu, 28 Nov 2024 20:01:39 +0100 Subject: [PATCH 1/2] =?UTF-8?q?[arcane,accelerator]=20Pr=C3=A9pare=20suppo?= =?UTF-8?q?rt=20pour=20positionner=20les=20arguments=20des=20kernels=20en?= =?UTF-8?q?=20fonction=20de=20l'occupancy.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/arcane/accelerator/RunQueueInternal.h | 12 +++++++----- .../accelerator/core/RunCommandLaunchInfo.cc | 9 +++++++++ .../accelerator/core/RunCommandLaunchInfo.h | 19 +++++++++++++++---- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arcane/src/arcane/accelerator/RunQueueInternal.h b/arcane/src/arcane/accelerator/RunQueueInternal.h index d3636d2f8..d0f2c4667 100644 --- a/arcane/src/arcane/accelerator/RunQueueInternal.h +++ b/arcane/src/arcane/accelerator/RunQueueInternal.h @@ -311,7 +311,7 @@ class InvalidKernelClass * \param func fonction à exécuter par le noyau * \param args arguments de la fonction lambda * - * TODO: Tester si Lambda est bien un fonction, le SFINAE étant peu lisible : + * TODO: Tester si Lambda est bien une fonction, le SFINAE étant peu lisible : * typename std::enable_if_t > >* = nullptr * attendons les concepts c++20 (requires) * @@ -321,10 +321,11 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args) { #if defined(ARCANE_COMPILING_CUDA) - auto [b, t] = launch_info.threadBlockInfo(); + Int32 wanted_shared_memory = 0; + auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast(kernel), wanted_shared_memory); cudaStream_t* s = reinterpret_cast(launch_info._internalStreamImpl()); // TODO: utiliser cudaLaunchKernel() à la place. - kernel<<>>(args, func, other_args...); + kernel<<>>(args, func, other_args...); #else ARCANE_UNUSED(launch_info); ARCANE_UNUSED(kernel); @@ -348,9 +349,10 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args) { #if defined(ARCANE_COMPILING_HIP) - auto [b, t] = launch_info.threadBlockInfo(); + Int32 wanted_shared_memory = 0; + auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast(kernel), wanted_shared_memory); hipStream_t* s = reinterpret_cast(launch_info._internalStreamImpl()); - hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...); + hipLaunchKernelGGL(kernel, b, t, wanted_shared_memory, *s, args, func, other_args...); #else ARCANE_UNUSED(launch_info); ARCANE_UNUSED(kernel); diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc index 21e29052b..3591ff94d 100644 --- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc +++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc @@ -167,6 +167,15 @@ computeLoopRunInfo() /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ +RunCommandLaunchInfo::ThreadBlockInfo RunCommandLaunchInfo:: +_threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const +{ + return m_thread_block_info; +} + +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ + void RunCommandLaunchInfo:: _addSyclEvent(void* sycl_event_ptr) { diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h index 955105a07..c56cdc092 100644 --- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h +++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h @@ -35,9 +35,16 @@ namespace Arcane::Accelerator::impl */ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo { + // Les fonctions suivantes permettent de lancer les kernels. template friend void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func, const LambdaArgs& args, const ReducerArgs&... reducer_args); + template + friend void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func, + const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args); + template + friend void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func, + const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args); public: @@ -89,10 +96,6 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo //! Taille totale de la boucle Int64 totalLoopSize() const { return m_total_loop_size; } - public: - - void* _internalStreamImpl(); - private: RunCommand& m_command; @@ -107,6 +110,14 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo private: + /*! + * \brief Informations dynamiques sur le nombre de block/thread/grille du noyau à lancer. + * + * Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur + * sous-jacent. + */ + ThreadBlockInfo _threadBlockInfo(const void* func, Int64 shared_memory_size) const; + void* _internalStreamImpl(); void _begin(); void _doEndKernelLaunch(); ThreadBlockInfo _computeThreadBlockInfo() const; From 404853612ca51fccc1b9c3d12e359b8480792b00 Mon Sep 17 00:00:00 2001 From: Gilles Grospellier Date: Thu, 28 Nov 2024 20:36:12 +0100 Subject: [PATCH 2/2] =?UTF-8?q?[arcane,accelerator]=20Renomme=20la=20class?= =?UTF-8?q?e=20'ThreadBlockInfo'=20en=20'KernelLaunchArgs'=20et=20la=20d?= =?UTF-8?q?=C3=A9place=20dans=20son=20propre=20fichier.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/arcane/accelerator/RunQueueInternal.h | 12 ++-- .../accelerator/core/AcceleratorCoreGlobal.h | 1 + .../accelerator/core/KernelLaunchArgs.h | 63 +++++++++++++++++++ .../accelerator/core/RunCommandLaunchInfo.cc | 12 ++-- .../accelerator/core/RunCommandLaunchInfo.h | 17 ++--- arcane/src/arcane/accelerator/core/srcs.cmake | 1 + 6 files changed, 84 insertions(+), 22 deletions(-) create mode 100644 arcane/src/arcane/accelerator/core/KernelLaunchArgs.h diff --git a/arcane/src/arcane/accelerator/RunQueueInternal.h b/arcane/src/arcane/accelerator/RunQueueInternal.h index d0f2c4667..c5162a0ce 100644 --- a/arcane/src/arcane/accelerator/RunQueueInternal.h +++ b/arcane/src/arcane/accelerator/RunQueueInternal.h @@ -322,10 +322,10 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern { #if defined(ARCANE_COMPILING_CUDA) Int32 wanted_shared_memory = 0; - auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast(kernel), wanted_shared_memory); + auto tbi = launch_info._threadBlockInfo(reinterpret_cast(kernel), wanted_shared_memory); cudaStream_t* s = reinterpret_cast(launch_info._internalStreamImpl()); // TODO: utiliser cudaLaunchKernel() à la place. - kernel<<>>(args, func, other_args...); + kernel<<>>(args, func, other_args...); #else ARCANE_UNUSED(launch_info); ARCANE_UNUSED(kernel); @@ -350,9 +350,9 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel { #if defined(ARCANE_COMPILING_HIP) Int32 wanted_shared_memory = 0; - auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast(kernel), wanted_shared_memory); + auto tbi = launch_info._threadBlockInfo(reinterpret_cast(kernel), wanted_shared_memory); hipStream_t* s = reinterpret_cast(launch_info._internalStreamImpl()); - hipLaunchKernelGGL(kernel, b, t, wanted_shared_memory, *s, args, func, other_args...); + hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s, args, func, other_args...); #else ARCANE_UNUSED(launch_info); ARCANE_UNUSED(kernel); @@ -379,7 +379,9 @@ void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel sycl::queue* s = reinterpret_cast(launch_info._internalStreamImpl()); sycl::event event; if constexpr (sizeof...(ReducerArgs) > 0) { - auto [b, t] = launch_info.threadBlockInfo(); + auto tbi = launch_info.kernelLaunchArgs(); + Int32 b = tbi.nbBlockPerGrid(); + Int32 t = tbi.nbThreadPerBlock(); sycl::nd_range<1> loop_size(b * t, t); event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); }); } diff --git a/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h b/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h index 078f277cc..b31627846 100644 --- a/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h +++ b/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h @@ -63,6 +63,7 @@ enum class eMemoryAdvice; namespace impl { + class KernelLaunchArgs; class RuntimeStaticInfo; class IRunnerRuntime; // typedef pour compatibilité avec anciennes versions (octobre 2022) diff --git a/arcane/src/arcane/accelerator/core/KernelLaunchArgs.h b/arcane/src/arcane/accelerator/core/KernelLaunchArgs.h new file mode 100644 index 000000000..13877720a --- /dev/null +++ b/arcane/src/arcane/accelerator/core/KernelLaunchArgs.h @@ -0,0 +1,63 @@ +// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*- +//----------------------------------------------------------------------------- +// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 +//----------------------------------------------------------------------------- +/*---------------------------------------------------------------------------*/ +/* KernelLaunchArgs.h (C) 2000-2024 */ +/* */ +/* Arguments pour lancer un kernel. */ +/*---------------------------------------------------------------------------*/ +#ifndef ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H +#define ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ + +#include "arcane/accelerator/core/AcceleratorCoreGlobal.h" + +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ + +namespace Arcane::Accelerator::impl +{ + +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ +/*! + * \internal + * \brief Arguments pour lancer un kernel. + */ +class ARCANE_ACCELERATOR_CORE_EXPORT KernelLaunchArgs +{ + friend RunCommandLaunchInfo; + + public: + + KernelLaunchArgs() = default; + KernelLaunchArgs(Int32 nb_block_per_grid, Int32 nb_thread_per_block) + : m_nb_block_per_grid(nb_block_per_grid) + , m_nb_thread_per_block(nb_thread_per_block) + { + } + + public: + + int nbBlockPerGrid() const { return m_nb_block_per_grid; } + int nbThreadPerBlock() const { return m_nb_thread_per_block; } + + private: + + int m_nb_block_per_grid = 0; + int m_nb_thread_per_block = 0; +}; + +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ + +} // End namespace Arcane::Accelerator::impl + +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ + +#endif diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc index 3591ff94d..9710e1e36 100644 --- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc +++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc @@ -33,7 +33,7 @@ RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size) : m_command(command) , m_total_loop_size(total_loop_size) { - m_thread_block_info = _computeThreadBlockInfo(); + m_kernel_launch_args = _computeKernelLaunchArgs(); _begin(); } @@ -58,7 +58,7 @@ _begin() m_exec_policy = queue.executionPolicy(); m_queue_stream = queue._internalStream(); m_runtime = queue._internalRuntime(); - m_command._allocateReduceMemory(m_thread_block_info.nb_block_per_grid); + m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid()); } /*---------------------------------------------------------------------------*/ @@ -118,8 +118,8 @@ _internalStreamImpl() /*---------------------------------------------------------------------------*/ //! Calcule le nombre de block/thread/grille du noyau en fonction de \a full_size -auto RunCommandLaunchInfo:: -_computeThreadBlockInfo() const -> ThreadBlockInfo +KernelLaunchArgs RunCommandLaunchInfo:: +_computeKernelLaunchArgs() const { int threads_per_block = m_command.nbThreadPerBlock(); if (threads_per_block<=0) @@ -167,10 +167,10 @@ computeLoopRunInfo() /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -RunCommandLaunchInfo::ThreadBlockInfo RunCommandLaunchInfo:: +KernelLaunchArgs RunCommandLaunchInfo:: _threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const { - return m_thread_block_info; + return m_kernel_launch_args; } /*---------------------------------------------------------------------------*/ diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h index c56cdc092..ec86d3d5c 100644 --- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h +++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h @@ -14,11 +14,10 @@ /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -#include "arcane/utils/CheckedConvert.h" #include "arcane/utils/ConcurrencyUtils.h" #include "arcane/utils/Profiling.h" -#include "arcane/accelerator/core/AcceleratorCoreGlobal.h" +#include "arcane/accelerator/core/KernelLaunchArgs.h" /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ @@ -48,11 +47,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo public: - struct ThreadBlockInfo - { - int nb_block_per_grid = 0; - int nb_thread_per_block = 0; - }; + using ThreadBlockInfo = KernelLaunchArgs; public: @@ -82,7 +77,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo void endExecute(); //! Informations sur le nombre de block/thread/grille du noyau à lancer. - ThreadBlockInfo threadBlockInfo() const { return m_thread_block_info; } + KernelLaunchArgs kernelLaunchArgs() const { return m_kernel_launch_args; } //! Calcul les informations pour les boucles multi-thread ParallelLoopOptions computeParallelLoopOptions() const; @@ -104,7 +99,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo IRunnerRuntime* m_runtime = nullptr; IRunQueueStream* m_queue_stream = nullptr; eExecutionPolicy m_exec_policy = eExecutionPolicy::Sequential; - ThreadBlockInfo m_thread_block_info; + KernelLaunchArgs m_kernel_launch_args; ForLoopRunInfo m_loop_run_info; Int64 m_total_loop_size = 0; @@ -116,11 +111,11 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo * Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur * sous-jacent. */ - ThreadBlockInfo _threadBlockInfo(const void* func, Int64 shared_memory_size) const; + KernelLaunchArgs _threadBlockInfo(const void* func, Int64 shared_memory_size) const; void* _internalStreamImpl(); void _begin(); void _doEndKernelLaunch(); - ThreadBlockInfo _computeThreadBlockInfo() const; + KernelLaunchArgs _computeKernelLaunchArgs() const; private: diff --git a/arcane/src/arcane/accelerator/core/srcs.cmake b/arcane/src/arcane/accelerator/core/srcs.cmake index 164ccb323..f53e2b247 100644 --- a/arcane/src/arcane/accelerator/core/srcs.cmake +++ b/arcane/src/arcane/accelerator/core/srcs.cmake @@ -10,6 +10,7 @@ set( ARCANE_SOURCES DeviceInfoList.h IReduceMemoryImpl.h IDeviceInfoList.h + KernelLaunchArgs.h Memory.h Memory.cc MemoryTracer.cc