Begin support to automatically compute thread and block size #1802

Merged
16 changes: 10 additions & 6 deletions arcane/src/arcane/accelerator/RunQueueInternal.h
@@ -311,7 +311,7 @@ class InvalidKernelClass
* \param func fonction à exécuter par le noyau
* \param args arguments de la fonction lambda
*
* TODO: Tester si Lambda est bien un fonction, le SFINAE étant peu lisible :
* TODO: Tester si Lambda est bien une fonction, le SFINAE étant peu lisible :
* typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
* attendons les concepts c++20 (requires)
*
@@ -321,10 +321,11 @@ _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kern
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
auto [b, t] = launch_info.threadBlockInfo();
Int32 wanted_shared_memory = 0;
auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
// TODO: utiliser cudaLaunchKernel() à la place.
kernel<<<b, t, 0, *s>>>(args, func, other_args...);
kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s>>>(args, func, other_args...);
#else
ARCANE_UNUSED(launch_info);
ARCANE_UNUSED(kernel);
@@ -348,9 +349,10 @@ _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
auto [b, t] = launch_info.threadBlockInfo();
Int32 wanted_shared_memory = 0;
auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, *s, args, func, other_args...);
#else
ARCANE_UNUSED(launch_info);
ARCANE_UNUSED(kernel);
@@ -377,7 +379,9 @@ void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel
sycl::queue* s = reinterpret_cast<sycl::queue*>(launch_info._internalStreamImpl());
sycl::event event;
if constexpr (sizeof...(ReducerArgs) > 0) {
auto [b, t] = launch_info.threadBlockInfo();
auto tbi = launch_info.kernelLaunchArgs();
Int32 b = tbi.nbBlockPerGrid();
Int32 t = tbi.nbThreadPerBlock();
sycl::nd_range<1> loop_size(b * t, t);
event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
}
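The key change in the CUDA and HIP paths above is that the kernel function pointer and the requested dynamic shared memory are now forwarded to launch_info._threadBlockInfo(...), which is what will let a backend size the launch automatically later on. The sketch below is not part of this PR; it only illustrates the kind of occupancy-based computation the new signature makes possible, and every name apart from the CUDA runtime call is invented.

// Sketch only (not Arcane code): derive launch arguments from a kernel pointer
// and its dynamic shared-memory size with the CUDA occupancy API.
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>

struct SketchLaunchArgs
{
  int nb_block_per_grid = 0;
  int nb_thread_per_block = 0;
};

inline SketchLaunchArgs
computeLaunchArgsFromOccupancy(const void* kernel_func, std::int64_t total_loop_size,
                               std::size_t wanted_shared_memory)
{
  int min_grid_size = 0;
  int block_size = 0;
  // Ask the runtime for the block size that maximizes occupancy for this kernel.
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                     kernel_func, wanted_shared_memory);
  if (block_size <= 0)
    block_size = 256; // fallback if the query could not run
  // One thread per loop iteration, rounded up to a whole number of blocks.
  std::int64_t nb_block = (total_loop_size + block_size - 1) / block_size;
  return { static_cast<int>(nb_block), block_size };
}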
1 change: 1 addition & 0 deletions arcane/src/arcane/accelerator/core/AcceleratorCoreGlobal.h
@@ -63,6 +63,7 @@ enum class eMemoryAdvice;

namespace impl
{
class KernelLaunchArgs;
class RuntimeStaticInfo;
class IRunnerRuntime;
// typedef pour compatibilité avec anciennes versions (octobre 2022)
63 changes: 63 additions & 0 deletions arcane/src/arcane/accelerator/core/KernelLaunchArgs.h
@@ -0,0 +1,63 @@
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* KernelLaunchArgs.h (C) 2000-2024 */
/* */
/* Arguments pour lancer un kernel. */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
#define ARCANE_ACCELERATOR_CORE_KERNELLAUNCHARGS_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
* \internal
* \brief Arguments pour lancer un kernel.
*/
class ARCANE_ACCELERATOR_CORE_EXPORT KernelLaunchArgs
{
friend RunCommandLaunchInfo;

public:

KernelLaunchArgs() = default;
KernelLaunchArgs(Int32 nb_block_per_grid, Int32 nb_thread_per_block)
: m_nb_block_per_grid(nb_block_per_grid)
, m_nb_thread_per_block(nb_thread_per_block)
{
}

public:

int nbBlockPerGrid() const { return m_nb_block_per_grid; }
int nbThreadPerBlock() const { return m_nb_thread_per_block; }

private:

int m_nb_block_per_grid = 0;
int m_nb_thread_per_block = 0;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif
17 changes: 13 additions & 4 deletions arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.cc
@@ -33,7 +33,7 @@
: m_command(command)
, m_total_loop_size(total_loop_size)
{
m_thread_block_info = _computeThreadBlockInfo();
m_kernel_launch_args = _computeKernelLaunchArgs();
_begin();
}

@@ -58,7 +58,7 @@
m_exec_policy = queue.executionPolicy();
m_queue_stream = queue._internalStream();
m_runtime = queue._internalRuntime();
m_command._allocateReduceMemory(m_thread_block_info.nb_block_per_grid);
m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid());
}

/*---------------------------------------------------------------------------*/
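The hunk just above also shows why the launch arguments must exist this early: _allocateReduceMemory() is sized with nbBlockPerGrid(), presumably because a grid-wide reduction keeps one partial result per launched block. A minimal, self-contained illustration of that sizing rule (plain C++, not the Arcane reduction machinery):

// Sketch only: one reduction slot per block, combined on the host afterwards.
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<double> allocateReduceMemory(int nb_block_per_grid)
{
  // One slot per launched block, zero-initialized before the kernel runs.
  return std::vector<double>(static_cast<std::size_t>(nb_block_per_grid), 0.0);
}

double finalizeSum(const std::vector<double>& per_block_partials)
{
  // Each block wrote its partial result into its own slot; combine them here.
  return std::accumulate(per_block_partials.begin(), per_block_partials.end(), 0.0);
}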
@@ -118,8 +118,8 @@
/*---------------------------------------------------------------------------*/

//! Calcule le nombre de block/thread/grille du noyau en fonction de \a full_size
auto RunCommandLaunchInfo::
_computeThreadBlockInfo() const -> ThreadBlockInfo
KernelLaunchArgs RunCommandLaunchInfo::
_computeKernelLaunchArgs() const
{
int threads_per_block = m_command.nbThreadPerBlock();
if (threads_per_block<=0)
@@ -167,6 +167,15 @@
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

KernelLaunchArgs RunCommandLaunchInfo::
_threadBlockInfo([[maybe_unused]] const void* func,[[maybe_unused]] Int64 shared_memory_size) const
{
return m_kernel_launch_args;
}

[Codecov warning: added lines 170 and 173 of RunCommandLaunchInfo.cc are not covered by tests.]

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void RunCommandLaunchInfo::
_addSyclEvent(void* sycl_event_ptr)
{
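The renamed _computeKernelLaunchArgs() (its body is truncated above) still derives the grid size from the requested number of threads per block and the total loop size. Below is a plausible shape of that computation, under explicit assumptions: the default of 256 threads per block and the ceiling division are guesses, since the elided lines of the real implementation are not visible in this diff.

// Sketch only: compute (blocks per grid, threads per block) from the loop size.
#include <cstdint>
#include <utility>

std::pair<int, int>
computeKernelLaunchArgs(std::int64_t total_loop_size, int requested_threads_per_block)
{
  int threads_per_block = requested_threads_per_block;
  if (threads_per_block <= 0)
    threads_per_block = 256; // assumed default when the command does not specify one
  // Enough blocks to cover every iteration (ceiling division), and at least one.
  std::int64_t nb_block = (total_loop_size + threads_per_block - 1) / threads_per_block;
  if (nb_block <= 0)
    nb_block = 1;
  return { static_cast<int>(nb_block), threads_per_block };
}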
34 changes: 20 additions & 14 deletions arcane/src/arcane/accelerator/core/RunCommandLaunchInfo.h
@@ -14,11 +14,10 @@
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/ConcurrencyUtils.h"
#include "arcane/utils/Profiling.h"

#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
@@ -35,17 +34,20 @@ namespace Arcane::Accelerator::impl
*/
class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
{
// Les fonctions suivantes permettent de lancer les kernels.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... ReducerArgs>
friend void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
const LambdaArgs& args, const ReducerArgs&... reducer_args);
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
friend void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
friend void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args);

public:

struct ThreadBlockInfo
{
int nb_block_per_grid = 0;
int nb_thread_per_block = 0;
};
using ThreadBlockInfo = KernelLaunchArgs;

public:

@@ -75,7 +77,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT RunCommandLaunchInfo
void endExecute();

//! Informations sur le nombre de block/thread/grille du noyau à lancer.
ThreadBlockInfo threadBlockInfo() const { return m_thread_block_info; }
KernelLaunchArgs kernelLaunchArgs() const { return m_kernel_launch_args; }

//! Calcul les informations pour les boucles multi-thread
ParallelLoopOptions computeParallelLoopOptions() const;
@@ -89,10 +91,6 @@
//! Taille totale de la boucle
Int64 totalLoopSize() const { return m_total_loop_size; }

public:

void* _internalStreamImpl();

private:

RunCommand& m_command;
@@ -101,15 +99,23 @@
IRunnerRuntime* m_runtime = nullptr;
IRunQueueStream* m_queue_stream = nullptr;
eExecutionPolicy m_exec_policy = eExecutionPolicy::Sequential;
ThreadBlockInfo m_thread_block_info;
KernelLaunchArgs m_kernel_launch_args;
ForLoopRunInfo m_loop_run_info;
Int64 m_total_loop_size = 0;

private:

/*!
* \brief Informations dynamiques sur le nombre de block/thread/grille du noyau à lancer.
*
* Ces informations sont calculées à partir de méthodes fournies par le runtime accélérateur
* sous-jacent.
*/
KernelLaunchArgs _threadBlockInfo(const void* func, Int64 shared_memory_size) const;
void* _internalStreamImpl();
void _begin();
void _doEndKernelLaunch();
ThreadBlockInfo _computeThreadBlockInfo() const;
KernelLaunchArgs _computeKernelLaunchArgs() const;

private:

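One consequence of the ThreadBlockInfo alias above: the old struct exposed its two fields publicly, while KernelLaunchArgs keeps them private, so call sites that destructured the struct (as the SYCL path did with auto [b, t] = ...) now have to go through the accessors. A self-contained illustration, outside Arcane:

// Illustration only: structured bindings stop compiling once the fields are
// private; reads go through the accessors instead.
class LaunchArgsLike
{
 public:
  LaunchArgsLike(int nb_block_per_grid, int nb_thread_per_block)
  : m_nb_block_per_grid(nb_block_per_grid)
  , m_nb_thread_per_block(nb_thread_per_block)
  {}
  int nbBlockPerGrid() const { return m_nb_block_per_grid; }
  int nbThreadPerBlock() const { return m_nb_thread_per_block; }

 private:
  int m_nb_block_per_grid = 0;
  int m_nb_thread_per_block = 0;
};

int main()
{
  LaunchArgsLike tbi(128, 256);
  // auto [b, t] = tbi;            // ill-formed now: the members are private
  int b = tbi.nbBlockPerGrid();    // read through accessors instead
  int t = tbi.nbThreadPerBlock();
  return (b == 128 && t == 256) ? 0 : 1;
}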
1 change: 1 addition & 0 deletions arcane/src/arcane/accelerator/core/srcs.cmake
@@ -10,6 +10,7 @@ set( ARCANE_SOURCES
DeviceInfoList.h
IReduceMemoryImpl.h
IDeviceInfoList.h
KernelLaunchArgs.h
Memory.h
Memory.cc
MemoryTracer.cc