From 8a16ff8e3150cd19bc7d0bac02303c3a90c59c97 Mon Sep 17 00:00:00 2001
From: Gilles Grospellier <Gilles.Grospellier@cea.fr>
Date: Thu, 28 Nov 2024 20:01:39 +0100
Subject: [PATCH 1/2] =?UTF-8?q?[arcane,accelerator]=20Pr=C3=A9pare=20suppo?=
 =?UTF-8?q?rt=20pour=20positionner=20les=20arguments=20des=20kernels=20en?=
 =?UTF-8?q?=20fonction=20de=20l'occupancy.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/arcane/accelerator/RunQueueInternal.h | 12 +++++++-----
 .../accelerator/core/RunCommandLaunchInfo.cc  |  9 +++++++++
 .../accelerator/core/RunCommandLaunchInfo.h   | 19 +++++++++++++++----
 3 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/arcane/src/arcane/accelerator/RunQueueInternal.h b/arcane/src/arcane/accelerator/RunQueueInternal.h
index d3636d2f8..d0f2c4667 100644
--- a/arcane/src/arcane/accelerator/RunQueueInternal.h
+++ b/arcane/src/arcane/accelerator/RunQueueInternal.h
@@ -311,7 +311,7 @@ class InvalidKernelClass
  * \param func fonction à exécuter par le noyau
  * \param args arguments de la fonction lambda
  * 
- * TODO: Tester si Lambda est bien un fonction, le SFINAE étant peu lisible :
+ * TODO: Tester si Lambda est bien une fonction, le SFINAE étant peu lisible :
  * typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
  * attendons les concepts c++20 (requires)
  * 
@@ -321,10 +321,11 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern
                  const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
 {
 #if defined(ARCANE_COMPILING_CUDA)
-  auto [b, t] = launch_info.threadBlockInfo();
+  Int32 wanted_shared_memory = 0;
+  auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
   cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
   // TODO: utiliser cudaLaunchKernel() à la place.
-  kernel<<<b, t, 0, *s>>>(args, func, other_args...);
+  kernel<<<b, t, wanted_shared_memory, *s>>>(args, func, other_args...);
 #else
   ARCANE_UNUSED(launch_info);
   ARCANE_UNUSED(kernel);
@@ -348,9 +349,10 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
 {
 #if defined(ARCANE_COMPILING_HIP)
-  auto [b, t] = launch_info.threadBlockInfo();
+  Int32 wanted_shared_memory = 0;
+  auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
   hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
-  hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
+  hipLaunchKernelGGL(kernel, b, t, wanted_shared_memory, *s, args, func, other_args...);
 #else
   ARCANE_UNUSED(launch_info);
   ARCANE_UNUSED(kernel);
diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
index 21e29052b..3591ff94d 100644
--- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
+++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
@@ -167,6 +167,15 @@ computeLoopRunInfo()
 /*---------------------------------------------------------------------------*/
 /*---------------------------------------------------------------------------*/
 
+RunCommandLaunchInfo::ThreadBlockInfo RunCommandLaunchInfo::
+_threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const
+{
+  return m_thread_block_info;
+}
+
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
 void RunCommandLaunchInfo::
 _addSyclEvent(void* sycl_event_ptr)
 {
diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
index 955105a07..c56cdc092 100644
--- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
+++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
@@ -35,9 +35,16 @@ namespace Arcane::Accelerator::impl
  */
 class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
 {
+  // Les fonctions suivantes permettent de lancer les kernels.
   template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... ReducerArgs>
   friend void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                                const LambdaArgs& args, const ReducerArgs&... reducer_args);
+  template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
+  friend void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
+                               const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);
+  template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
+  friend void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
+                              const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);
 
  public:
 
@@ -89,10 +96,6 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
   //! Taille totale de la boucle
   Int64 totalLoopSize() const { return m_total_loop_size; }
 
- public:
-
-  void* _internalStreamImpl();
-
  private:
 
   RunCommand& m_command;
@@ -107,6 +110,14 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
 
  private:
 
+  /*!
+   * \brief Informations dynamiques sur le nombre de block/thread/grille du noyau à lancer.
+   *
+   * Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur
+   * sous-jacent.
+   */
+  ThreadBlockInfo _threadBlockInfo(const void* func, Int64 shared_memory_size) const;
+  void* _internalStreamImpl();
   void _begin();
   void _doEndKernelLaunch();
   ThreadBlockInfo _computeThreadBlockInfo() const;

From 404853612ca51fccc1b9c3d12e359b8480792b00 Mon Sep 17 00:00:00 2001
From: Gilles Grospellier <Gilles.Grospellier@cea.fr>
Date: Thu, 28 Nov 2024 20:36:12 +0100
Subject: [PATCH 2/2] =?UTF-8?q?[arcane,accelerator]=20Renomme=20la=20class?=
 =?UTF-8?q?e=20'ThreadBlockInfo'=20en=20'KernelLaunchArgs'=20et=20la=20d?=
 =?UTF-8?q?=C3=A9place=20dans=20son=20propre=20fichier.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/arcane/accelerator/RunQueueInternal.h | 12 ++--
 .../accelerator/core/AcceleratorCoreGlobal.h  |  1 +
 .../accelerator/core/KernelLaunchArgs.h       | 63 +++++++++++++++++++
 .../accelerator/core/RunCommandLaunchInfo.cc  | 12 ++--
 .../accelerator/core/RunCommandLaunchInfo.h   | 17 ++---
 arcane/src/arcane/accelerator/core/srcs.cmake |  1 +
 6 files changed, 84 insertions(+), 22 deletions(-)
 create mode 100644 arcane/src/arcane/accelerator/core/KernelLaunchArgs.h

diff --git a/arcane/src/arcane/accelerator/RunQueueInternal.h b/arcane/src/arcane/accelerator/RunQueueInternal.h
index d0f2c4667..c5162a0ce 100644
--- a/arcane/src/arcane/accelerator/RunQueueInternal.h
+++ b/arcane/src/arcane/accelerator/RunQueueInternal.h
@@ -322,10 +322,10 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern
 {
 #if defined(ARCANE_COMPILING_CUDA)
   Int32 wanted_shared_memory = 0;
-  auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
+  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
   cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
   // TODO: utiliser cudaLaunchKernel() à la place.
-  kernel<<<b, t, wanted_shared_memory, *s>>>(args, func, other_args...);
+  kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s>>>(args, func, other_args...);
 #else
   ARCANE_UNUSED(launch_info);
   ARCANE_UNUSED(kernel);
@@ -350,9 +350,9 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel
 {
 #if defined(ARCANE_COMPILING_HIP)
   Int32 wanted_shared_memory = 0;
-  auto [b, t] = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
+  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
   hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
-  hipLaunchKernelGGL(kernel, b, t, wanted_shared_memory, *s, args, func, other_args...);
+  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s, args, func, other_args...);
 #else
   ARCANE_UNUSED(launch_info);
   ARCANE_UNUSED(kernel);
@@ -379,7 +379,9 @@ void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel
   sycl::queue* s = reinterpret_cast<sycl::queue*>(launch_info._internalStreamImpl());
   sycl::event event;
   if constexpr (sizeof...(ReducerArgs) > 0) {
-    auto [b, t] = launch_info.threadBlockInfo();
+    auto tbi = launch_info.kernelLaunchArgs();
+    Int32 b = tbi.nbBlockPerGrid();
+    Int32 t = tbi.nbThreadPerBlock();
     sycl::nd_range<1> loop_size(b * t, t);
     event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
   }
diff --git a/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h b/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h
index 078f277cc..b31627846 100644
--- a/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h
+++ b/arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h
@@ -63,6 +63,7 @@ enum class eMemoryAdvice;
 
 namespace impl
 {
+  class KernelLaunchArgs;
   class RuntimeStaticInfo;
   class IRunnerRuntime;
   // typedef pour compatibilité avec anciennes versions (octobre 2022)
diff --git a/arcane/src/arcane/accelerator/core/KernelLaunchArgs.h b/arcane/src/arcane/accelerator/core/KernelLaunchArgs.h
new file mode 100644
index 000000000..13877720a
--- /dev/null
+++ b/arcane/src/arcane/accelerator/core/KernelLaunchArgs.h
@@ -0,0 +1,63 @@
+﻿// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
+//-----------------------------------------------------------------------------
+// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0
+//-----------------------------------------------------------------------------
+/*---------------------------------------------------------------------------*/
+/* KernelLaunchArgs.h                                          (C) 2000-2024 */
+/*                                                                           */
+/* Arguments pour lancer un kernel.                                          */
+/*---------------------------------------------------------------------------*/
+#ifndef ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
+#define ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
+#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"
+
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
+namespace Arcane::Accelerator::impl
+{
+
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/*!
+ * \internal
+ * \brief Arguments pour lancer un kernel.
+ */
+class ARCANE_ACCELERATOR_CORE_EXPORT KernelLaunchArgs
+{
+  friend RunCommandLaunchInfo;
+
+ public:
+
+  KernelLaunchArgs() = default;
+  KernelLaunchArgs(Int32 nb_block_per_grid, Int32 nb_thread_per_block)
+  : m_nb_block_per_grid(nb_block_per_grid)
+  , m_nb_thread_per_block(nb_thread_per_block)
+  {
+  }
+
+ public:
+
+  int nbBlockPerGrid() const { return m_nb_block_per_grid; }
+  int nbThreadPerBlock() const { return m_nb_thread_per_block; }
+
+ private:
+
+  int m_nb_block_per_grid = 0;
+  int m_nb_thread_per_block = 0;
+};
+
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
+} // End namespace Arcane::Accelerator::impl
+
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
+#endif  
diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
index 3591ff94d..9710e1e36 100644
--- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
+++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
@@ -33,7 +33,7 @@ RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size)
 : m_command(command)
 , m_total_loop_size(total_loop_size)
 {
-  m_thread_block_info = _computeThreadBlockInfo();
+  m_kernel_launch_args = _computeKernelLaunchArgs();
   _begin();
 }
 
@@ -58,7 +58,7 @@ _begin()
   m_exec_policy = queue.executionPolicy();
   m_queue_stream = queue._internalStream();
   m_runtime = queue._internalRuntime();
-  m_command._allocateReduceMemory(m_thread_block_info.nb_block_per_grid);
+  m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid());
 }
 
 /*---------------------------------------------------------------------------*/
@@ -118,8 +118,8 @@ _internalStreamImpl()
 /*---------------------------------------------------------------------------*/
 
 //! Calcule le nombre de block/thread/grille du noyau en fonction de \a full_size
-auto RunCommandLaunchInfo::
-_computeThreadBlockInfo() const -> ThreadBlockInfo
+KernelLaunchArgs RunCommandLaunchInfo::
+_computeKernelLaunchArgs() const
 {
   int threads_per_block = m_command.nbThreadPerBlock();
   if (threads_per_block<=0)
@@ -167,10 +167,10 @@ computeLoopRunInfo()
 /*---------------------------------------------------------------------------*/
 /*---------------------------------------------------------------------------*/
 
-RunCommandLaunchInfo::ThreadBlockInfo RunCommandLaunchInfo::
+KernelLaunchArgs RunCommandLaunchInfo::
 _threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const
 {
-  return m_thread_block_info;
+  return m_kernel_launch_args;
 }
 
 /*---------------------------------------------------------------------------*/
diff --git a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
index c56cdc092..ec86d3d5c 100644
--- a/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
+++ b/arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
@@ -14,11 +14,10 @@
 /*---------------------------------------------------------------------------*/
 /*---------------------------------------------------------------------------*/
 
-#include "arcane/utils/CheckedConvert.h"
 #include "arcane/utils/ConcurrencyUtils.h"
 #include "arcane/utils/Profiling.h"
 
-#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"
+#include "arcane/accelerator/core/KernelLaunchArgs.h"
 
 /*---------------------------------------------------------------------------*/
 /*---------------------------------------------------------------------------*/
@@ -48,11 +47,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
 
  public:
 
-  struct ThreadBlockInfo
-  {
-    int nb_block_per_grid = 0;
-    int nb_thread_per_block = 0;
-  };
+  using ThreadBlockInfo = KernelLaunchArgs;
 
  public:
 
@@ -82,7 +77,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
   void endExecute();
 
   //! Informations sur le nombre de block/thread/grille du noyau à lancer.
-  ThreadBlockInfo threadBlockInfo() const { return m_thread_block_info; }
+  KernelLaunchArgs kernelLaunchArgs() const { return m_kernel_launch_args; }
 
   //! Calcul les informations pour les boucles multi-thread
   ParallelLoopOptions computeParallelLoopOptions() const;
@@ -104,7 +99,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
   IRunnerRuntime* m_runtime = nullptr;
   IRunQueueStream* m_queue_stream = nullptr;
   eExecutionPolicy m_exec_policy = eExecutionPolicy::Sequential;
-  ThreadBlockInfo m_thread_block_info;
+  KernelLaunchArgs m_kernel_launch_args;
   ForLoopRunInfo m_loop_run_info;
   Int64 m_total_loop_size = 0;
 
@@ -116,11 +111,11 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
    * Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur
    * sous-jacent.
    */
-  ThreadBlockInfo _threadBlockInfo(const void* func, Int64 shared_memory_size) const;
+  KernelLaunchArgs _threadBlockInfo(const void* func, Int64 shared_memory_size) const;
   void* _internalStreamImpl();
   void _begin();
   void _doEndKernelLaunch();
-  ThreadBlockInfo _computeThreadBlockInfo() const;
+  KernelLaunchArgs _computeKernelLaunchArgs() const;
 
  private:
 
diff --git a/arcane/src/arcane/accelerator/core/srcs.cmake b/arcane/src/arcane/accelerator/core/srcs.cmake
index 164ccb323..f53e2b247 100644
--- a/arcane/src/arcane/accelerator/core/srcs.cmake
+++ b/arcane/src/arcane/accelerator/core/srcs.cmake
@@ -10,6 +10,7 @@ set( ARCANE_SOURCES
   DeviceInfoList.h
   IReduceMemoryImpl.h
   IDeviceInfoList.h
+  KernelLaunchArgs.h
   Memory.h
   Memory.cc
   MemoryTracer.cc