Begin support to automatically compute thread and block size #1802

Merged
16 changes: 10 additions & 6 deletions arcane/src/arcane/accelerator/RunQueueInternal.h
@@ -311,7 +311,7 @@ class InvalidKernelClass
* \param func fonction à exécuter par le noyau
* \param args arguments de la fonction lambda
*
* TODO: Tester si Lambda est bien un fonction, le SFINAE étant peu lisible :
* TODO: Tester si Lambda est bien une fonction, le SFINAE étant peu lisible :
* typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
* attendons les concepts c++20 (requires)
*
@@ -321,10 +321,11 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
auto [b, t] = launch_info.threadBlockInfo();
Int32 wanted_shared_memory = 0;
auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
// TODO: utiliser cudaLaunchKernel() à la place.
kernel<<<b, t, 0, *s>>>(args, func, other_args...);
kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s>>>(args, func, other_args...);
#else
ARCANE_UNUSED(launch_info);
ARCANE_UNUSED(kernel);
@@ -348,9 +349,10 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
auto [b, t] = launch_info.threadBlockInfo();
Int32 wanted_shared_memory = 0;
auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s, args, func, other_args...);
#else
ARCANE_UNUSED(launch_info);
ARCANE_UNUSED(kernel);
@@ -377,7 +379,9 @@ void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel
sycl::queue* s = reinterpret_cast<sycl::queue*>(launch_info._internalStreamImpl());
sycl::event event;
if constexpr (sizeof...(ReducerArgs) > 0) {
auto [b, t] = launch_info.threadBlockInfo();
auto tbi = launch_info.kernelLaunchArgs();
Int32 b = tbi.nbBlockPerGrid();
Int32 t = tbi.nbThreadPerBlock();
sycl::nd_range<1> loop_size(b * t, t);
event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
}
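The key change in the CUDA and HIP paths above is that the kernel function pointer and the requested dynamic shared memory are now forwarded to launch_info._threadBlockInfo(...), which is what will let a backend size the launch automatically later on. The sketch below is not part of this PR; it only illustrates the kind of occupancy-based computation the new signature makes possible, and every name apart from the CUDA runtime call is invented.

// Sketch only (not Arcane code): derive launch arguments from a kernel pointer
// and its dynamic shared-memory size with the CUDA occupancy API.
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>

struct SketchLaunchArgs
{
  int nb_block_per_grid = 0;
  int nb_thread_per_block = 0;
};

inline SketchLaunchArgs
computeLaunchArgsFromOccupancy(const void* kernel_func, std::int64_t total_loop_size,
                               std::size_t wanted_shared_memory)
{
  int min_grid_size = 0;
  int block_size = 0;
  // Ask the runtime for the block size that maximizes occupancy for this kernel.
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                     kernel_func, wanted_shared_memory);
  if (block_size <= 0)
    block_size = 256; // fallback if the query could not run
  // One thread per loop iteration, rounded up to a whole number of blocks.
  std::int64_t nb_block = (total_loop_size + block_size - 1) / block_size;
  return { static_cast<int>(nb_block), block_size };
}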
1 change: 1 addition & 0 deletions arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h
@@ -63,6 +63,7 @@ enum class eMemoryAdvice;

namespace impl
{
class KernelLaunchArgs;
class RuntimeStaticInfo;
class IRunnerRuntime;
// typedef pour compatibilité avec anciennes versions (octobre 2022)
63 changes: 63 additions & 0 deletions arcane/src/arcane/accelerator/core/KernelLaunchArgs.h
@@ -0,0 +1,63 @@
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* KernelLaunchArgs.h (C) 2000-2024 */
/* */
/* Arguments pour lancer un kernel. */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
#define ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
* \internal
* \brief Arguments pour lancer un kernel.
*/
class ARCANE_ACCELERATOR_CORE_EXPORT KernelLaunchArgs
{
friend RunCommandLaunchInfo;

public:

KernelLaunchArgs() = default;
KernelLaunchArgs(Int32 nb_block_per_grid, Int32 nb_thread_per_block)
: m_nb_block_per_grid(nb_block_per_grid)
, m_nb_thread_per_block(nb_thread_per_block)
{
}

public:

int nbBlockPerGrid() const { return m_nb_block_per_grid; }
int nbThreadPerBlock() const { return m_nb_thread_per_block; }

private:

int m_nb_block_per_grid = 0;
int m_nb_thread_per_block = 0;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif
17 changes: 13 additions & 4 deletions arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
@@ -33,7 +33,7 @@
: m_command(command)
, m_total_loop_size(total_loop_size)
{
m_thread_block_info = _computeThreadBlockInfo();
m_kernel_launch_args = _computeKernelLaunchArgs();
_begin();
}

@@ -58,7 +58,7 @@
m_exec_policy = queue.executionPolicy();
m_queue_stream = queue._internalStream();
m_runtime = queue._internalRuntime();
m_command._allocateReduceMemory(m_thread_block_info.nb_block_per_grid);
m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid());
}

/*---------------------------------------------------------------------------*/
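The hunk just above also shows why the launch arguments must exist this early: _allocateReduceMemory() is sized with nbBlockPerGrid(), presumably because a grid-wide reduction keeps one partial result per launched block. A minimal, self-contained illustration of that sizing rule (plain C++, not the Arcane reduction machinery):

// Sketch only: one reduction slot per block, combined on the host afterwards.
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<double> allocateReduceMemory(int nb_block_per_grid)
{
  // One slot per launched block, zero-initialized before the kernel runs.
  return std::vector<double>(static_cast<std::size_t>(nb_block_per_grid), 0.0);
}

double finalizeSum(const std::vector<double>& per_block_partials)
{
  // Each block wrote its partial result into its own slot; combine them here.
  return std::accumulate(per_block_partials.begin(), per_block_partials.end(), 0.0);
}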
@@ -118,8 +118,8 @@
/*---------------------------------------------------------------------------*/

//! Calcule le nombre de block/thread/grille du noyau en fonction de \a full_size
auto RunCommandLaunchInfo::
_computeThreadBlockInfo() const -> ThreadBlockInfo
KernelLaunchArgs RunCommandLaunchInfo::
_computeKernelLaunchArgs() const
{
int threads_per_block = m_command.nbThreadPerBlock();
if (threads_per_block<=0)
@@ -167,6 +167,15 @@
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

KernelLaunchArgs RunCommandLaunchInfo::
_threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const
{
return m_kernel_launch_args;
}

[Codecov warning: added lines 170 and 173 of RunCommandLaunchInfo.cc are not covered by tests.]

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void RunCommandLaunchInfo::
_addSyclEvent(void* sycl_event_ptr)
{
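The renamed _computeKernelLaunchArgs() (its body is truncated above) still derives the grid size from the requested number of threads per block and the total loop size. Below is a plausible shape of that computation, under explicit assumptions: the default of 256 threads per block and the ceiling division are guesses, since the elided lines of the real implementation are not visible in this diff.

// Sketch only: compute (blocks per grid, threads per block) from the loop size.
#include <cstdint>
#include <utility>

std::pair<int, int>
computeKernelLaunchArgs(std::int64_t total_loop_size, int requested_threads_per_block)
{
  int threads_per_block = requested_threads_per_block;
  if (threads_per_block <= 0)
    threads_per_block = 256; // assumed default when the command does not specify one
  // Enough blocks to cover every iteration (ceiling division), and at least one.
  std::int64_t nb_block = (total_loop_size + threads_per_block - 1) / threads_per_block;
  if (nb_block <= 0)
    nb_block = 1;
  return { static_cast<int>(nb_block), threads_per_block };
}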
34 changes: 20 additions & 14 deletions arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
@@ -14,11 +14,10 @@
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/ConcurrencyUtils.h"
#include "arcane/utils/Profiling.h"

#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
@@ -35,17 +34,20 @@ namespace Arcane::Accelerator::impl
*/
class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
{
// Les fonctions suivantes permettent de lancer les kernels.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... ReducerArgs>
friend void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
const LambdaArgs& args, const ReducerArgs&... reducer_args);
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
friend void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
friend void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);

public:

struct ThreadBlockInfo
{
int nb_block_per_grid = 0;
int nb_thread_per_block = 0;
};
using ThreadBlockInfo = KernelLaunchArgs;

public:

@@ -75,7 +77,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
void endExecute();

//! Informations sur le nombre de block/thread/grille du noyau à lancer.
ThreadBlockInfo threadBlockInfo() const { return m_thread_block_info; }
KernelLaunchArgs kernelLaunchArgs() const { return m_kernel_launch_args; }

//! Calcul les informations pour les boucles multi-thread
ParallelLoopOptions computeParallelLoopOptions() const;
@@ -89,10 +91,6 @@
//! Taille totale de la boucle
Int64 totalLoopSize() const { return m_total_loop_size; }

public:

void* _internalStreamImpl();

private:

RunCommand& m_command;
@@ -101,15 +99,23 @@
IRunnerRuntime* m_runtime = nullptr;
IRunQueueStream* m_queue_stream = nullptr;
eExecutionPolicy m_exec_policy = eExecutionPolicy::Sequential;
ThreadBlockInfo m_thread_block_info;
KernelLaunchArgs m_kernel_launch_args;
ForLoopRunInfo m_loop_run_info;
Int64 m_total_loop_size = 0;

private:

/*!
* \brief Informations dynamiques sur le nombre de block/thread/grille du noyau à lancer.
*
* Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur
* sous-jacent.
*/
KernelLaunchArgs _threadBlockInfo(const void* func, Int64 shared_memory_size) const;
void* _internalStreamImpl();
void _begin();
void _doEndKernelLaunch();
ThreadBlockInfo _computeThreadBlockInfo() const;
KernelLaunchArgs _computeKernelLaunchArgs() const;

private:

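One consequence of the ThreadBlockInfo alias above: the old struct exposed its two fields publicly, while KernelLaunchArgs keeps them private, so call sites that destructured the struct (as the SYCL path did with auto [b, t] = ...) now have to go through the accessors. A self-contained illustration, outside Arcane:

// Illustration only: structured bindings stop compiling once the fields are
// private; reads go through the accessors instead.
class LaunchArgsLike
{
 public:
  LaunchArgsLike(int nb_block_per_grid, int nb_thread_per_block)
  : m_nb_block_per_grid(nb_block_per_grid)
  , m_nb_thread_per_block(nb_thread_per_block)
  {}
  int nbBlockPerGrid() const { return m_nb_block_per_grid; }
  int nbThreadPerBlock() const { return m_nb_thread_per_block; }

 private:
  int m_nb_block_per_grid = 0;
  int m_nb_thread_per_block = 0;
};

int main()
{
  LaunchArgsLike tbi(128, 256);
  // auto [b, t] = tbi;            // ill-formed now: the members are private
  int b = tbi.nbBlockPerGrid();    // read through accessors instead
  int t = tbi.nbThreadPerBlock();
  return (b == 128 && t == 256) ? 0 : 1;
}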
1 change: 1 addition & 0 deletions arcane/src/arcane/accelerator/core/srcs.cmake
@@ -10,6 +10,7 @@ set( ARCANE_SOURCES
DeviceInfoList.h
IReduceMemoryImpl.h
IDeviceInfoList.h
KernelLaunchArgs.h
Memory.h
Memory.cc
MemoryTracer.cc