Skip to content

Commit e4783dd

Browse files
Merge pull request #1802 from arcaneframework/dev/gg-begin-support-to-automatically-compute-thread-and-block-size
Begin support to automatically compute thread and block size
2 parents b20f9f2 + 4048536 commit e4783dd

File tree

6 files changed

+108
-24
lines changed

6 files changed

+108
-24
lines changed

arcane/src/arcane/accelerator/RunQueueInternal.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ class InvalidKernelClass
311311
* \param func fonction à exécuter par le noyau
312312
* \param args arguments de la fonction lambda
313313
*
314-
* TODO: Tester si Lambda est bien un fonction, le SFINAE étant peu lisible :
314+
* TODO: Tester si Lambda est bien une fonction, le SFINAE étant peu lisible :
315315
* typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
316316
* attendons les concepts c++20 (requires)
317317
*
@@ -321,10 +321,11 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern
321321
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
322322
{
323323
#if defined(ARCANE_COMPILING_CUDA)
324-
auto [b, t] = launch_info.threadBlockInfo();
324+
Int32 wanted_shared_memory = 0;
325+
auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
325326
cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
326327
// TODO: utiliser cudaLaunchKernel() à la place.
327-
kernel<<<b, t, 0, *s>>>(args, func, other_args...);
328+
kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s>>>(args, func, other_args...);
328329
#else
329330
ARCANE_UNUSED(launch_info);
330331
ARCANE_UNUSED(kernel);
@@ -348,9 +349,10 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel
348349
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
349350
{
350351
#if defined(ARCANE_COMPILING_HIP)
351-
auto [b, t] = launch_info.threadBlockInfo();
352+
Int32 wanted_shared_memory = 0;
353+
auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
352354
hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
353-
hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
355+
hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s, args, func, other_args...);
354356
#else
355357
ARCANE_UNUSED(launch_info);
356358
ARCANE_UNUSED(kernel);
@@ -377,7 +379,9 @@ void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel
377379
sycl::queue* s = reinterpret_cast<sycl::queue*>(launch_info._internalStreamImpl());
378380
sycl::event event;
379381
if constexpr (sizeof...(ReducerArgs) > 0) {
380-
auto [b, t] = launch_info.threadBlockInfo();
382+
auto tbi = launch_info.kernelLaunchArgs();
383+
Int32 b = tbi.nbBlockPerGrid();
384+
Int32 t = tbi.nbThreadPerBlock();
381385
sycl::nd_range<1> loop_size(b * t, t);
382386
event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
383387
}

arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ enum class eMemoryAdvice;
6363

6464
namespace impl
6565
{
66+
class KernelLaunchArgs;
6667
class RuntimeStaticInfo;
6768
class IRunnerRuntime;
6869
// typedef pour compatibilité avec anciennes versions (octobre 2022)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2+
//-----------------------------------------------------------------------------
3+
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4+
// See the top-level COPYRIGHT file for details.
5+
// SPDX-License-Identifier: Apache-2.0
6+
//-----------------------------------------------------------------------------
7+
/*---------------------------------------------------------------------------*/
8+
/* KernelLaunchArgs.h (C) 2000-2024 */
9+
/* */
10+
/* Arguments pour lancer un kernel. */
11+
/*---------------------------------------------------------------------------*/
12+
#ifndef ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
13+
#define ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
14+
/*---------------------------------------------------------------------------*/
15+
/*---------------------------------------------------------------------------*/
16+
17+
#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"
18+
19+
/*---------------------------------------------------------------------------*/
20+
/*---------------------------------------------------------------------------*/
21+
22+
namespace Arcane::Accelerator::impl
23+
{
24+
25+
/*---------------------------------------------------------------------------*/
26+
/*---------------------------------------------------------------------------*/
27+
/*!
28+
* \internal
29+
* \brief Arguments pour lancer un kernel.
30+
*/
31+
class ARCANE_ACCELERATOR_CORE_EXPORT KernelLaunchArgs
32+
{
33+
friend RunCommandLaunchInfo;
34+
35+
public:
36+
37+
KernelLaunchArgs() = default;
38+
KernelLaunchArgs(Int32 nb_block_per_grid, Int32 nb_thread_per_block)
39+
: m_nb_block_per_grid(nb_block_per_grid)
40+
, m_nb_thread_per_block(nb_thread_per_block)
41+
{
42+
}
43+
44+
public:
45+
46+
int nbBlockPerGrid() const { return m_nb_block_per_grid; }
47+
int nbThreadPerBlock() const { return m_nb_thread_per_block; }
48+
49+
private:
50+
51+
int m_nb_block_per_grid = 0;
52+
int m_nb_thread_per_block = 0;
53+
};
54+
55+
/*---------------------------------------------------------------------------*/
56+
/*---------------------------------------------------------------------------*/
57+
58+
} // End namespace Arcane::Accelerator::impl
59+
60+
/*---------------------------------------------------------------------------*/
61+
/*---------------------------------------------------------------------------*/
62+
63+
#endif

arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size)
3333
: m_command(command)
3434
, m_total_loop_size(total_loop_size)
3535
{
36-
m_thread_block_info = _computeThreadBlockInfo();
36+
m_kernel_launch_args = _computeKernelLaunchArgs();
3737
_begin();
3838
}
3939

@@ -58,7 +58,7 @@ _begin()
5858
m_exec_policy = queue.executionPolicy();
5959
m_queue_stream = queue._internalStream();
6060
m_runtime = queue._internalRuntime();
61-
m_command._allocateReduceMemory(m_thread_block_info.nb_block_per_grid);
61+
m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid());
6262
}
6363

6464
/*---------------------------------------------------------------------------*/
@@ -118,8 +118,8 @@ _internalStreamImpl()
118118
/*---------------------------------------------------------------------------*/
119119

120120
//! Calcule le nombre de block/thread/grille du noyau en fonction de \a full_size
121-
auto RunCommandLaunchInfo::
122-
_computeThreadBlockInfo() const -> ThreadBlockInfo
121+
KernelLaunchArgs RunCommandLaunchInfo::
122+
_computeKernelLaunchArgs() const
123123
{
124124
int threads_per_block = m_command.nbThreadPerBlock();
125125
if (threads_per_block<=0)
@@ -167,6 +167,15 @@ computeLoopRunInfo()
167167
/*---------------------------------------------------------------------------*/
168168
/*---------------------------------------------------------------------------*/
169169

170+
KernelLaunchArgs RunCommandLaunchInfo::
171+
_threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const
172+
{
173+
return m_kernel_launch_args;
174+
}
175+
176+
/*---------------------------------------------------------------------------*/
177+
/*---------------------------------------------------------------------------*/
178+
170179
void RunCommandLaunchInfo::
171180
_addSyclEvent(void* sycl_event_ptr)
172181
{

arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@
1414
/*---------------------------------------------------------------------------*/
1515
/*---------------------------------------------------------------------------*/
1616

17-
#include "arcane/utils/CheckedConvert.h"
1817
#include "arcane/utils/ConcurrencyUtils.h"
1918
#include "arcane/utils/Profiling.h"
2019

21-
#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"
20+
#include "arcane/accelerator/core/KernelLaunchArgs.h"
2221

2322
/*---------------------------------------------------------------------------*/
2423
/*---------------------------------------------------------------------------*/
@@ -35,17 +34,20 @@ namespace Arcane::Accelerator::impl
3534
*/
3635
class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
3736
{
37+
// Les fonctions suivantes permettent de lancer les kernels.
3838
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... ReducerArgs>
3939
friend void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
4040
const LambdaArgs& args, const ReducerArgs&... reducer_args);
41+
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
42+
friend void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
43+
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);
44+
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
45+
friend void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
46+
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);
4147

4248
public:
4349

44-
struct ThreadBlockInfo
45-
{
46-
int nb_block_per_grid = 0;
47-
int nb_thread_per_block = 0;
48-
};
50+
using ThreadBlockInfo = KernelLaunchArgs;
4951

5052
public:
5153

@@ -75,7 +77,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
7577
void endExecute();
7678

7779
//! Informations sur le nombre de block/thread/grille du noyau à lancer.
78-
ThreadBlockInfo threadBlockInfo() const { return m_thread_block_info; }
80+
KernelLaunchArgs kernelLaunchArgs() const { return m_kernel_launch_args; }
7981

8082
//! Calcul les informations pour les boucles multi-thread
8183
ParallelLoopOptions computeParallelLoopOptions() const;
@@ -89,10 +91,6 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
8991
//! Taille totale de la boucle
9092
Int64 totalLoopSize() const { return m_total_loop_size; }
9193

92-
public:
93-
94-
void* _internalStreamImpl();
95-
9694
private:
9795

9896
RunCommand& m_command;
@@ -101,15 +99,23 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
10199
IRunnerRuntime* m_runtime = nullptr;
102100
IRunQueueStream* m_queue_stream = nullptr;
103101
eExecutionPolicy m_exec_policy = eExecutionPolicy::Sequential;
104-
ThreadBlockInfo m_thread_block_info;
102+
KernelLaunchArgs m_kernel_launch_args;
105103
ForLoopRunInfo m_loop_run_info;
106104
Int64 m_total_loop_size = 0;
107105

108106
private:
109107

108+
/*!
109+
* \brief Informations dynamiques sur le nombre de block/thread/grille du noyau à lancer.
110+
*
111+
* Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur
112+
* sous-jacent.
113+
*/
114+
KernelLaunchArgs _threadBlockInfo(const void* func, Int64 shared_memory_size) const;
115+
void* _internalStreamImpl();
110116
void _begin();
111117
void _doEndKernelLaunch();
112-
ThreadBlockInfo _computeThreadBlockInfo() const;
118+
KernelLaunchArgs _computeKernelLaunchArgs() const;
113119

114120
private:
115121

arcane/src/arcane/accelerator/core/srcs.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ set( ARCANE_SOURCES
1010
DeviceInfoList.h
1111
IReduceMemoryImpl.h
1212
IDeviceInfoList.h
13+
KernelLaunchArgs.h
1314
Memory.h
1415
Memory.cc
1516
MemoryTracer.cc

0 commit comments

Comments
 (0)