Skip to content

Commit 90f1b0e

Browse files
Merge pull request #1890 from arcaneframework/dev/gg-add-grain-size-for-multi-dim-multi-thread-loop
Add support for specifying grain size for multi-dimensional multi-threaded loops
2 parents c8a2174 + 1e19b65 commit 90f1b0e

File tree

2 files changed

+81
-30
lines changed

2 files changed

+81
-30
lines changed

arcane/src/arcane/parallel/thread/TBBTaskImplementation.cc

+76-28
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
#include "arcane/utils/PlatformUtils.h"
2222
#include "arcane/utils/Profiling.h"
2323
#include "arcane/utils/MemoryAllocator.h"
24+
#include "arcane/utils/FixedArray.h"
2425
#include "arcane/utils/internal/TaskFactoryInternal.h"
2526

2627
#include "arcane/core/FactoryService.h"
@@ -197,27 +198,27 @@ _toTBBRange(const ComplexForLoopRanges<4>& r)
197198
/*---------------------------------------------------------------------------*/
198199

199200
inline tbb::blocked_rangeNd<Int32,2>
200-
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32,2>& r,std::size_t grain_size)
201+
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32,2>& r,FixedArray<size_t,2> grain_sizes)
201202
{
202-
return {{r.dim(0).begin(), r.dim(0).end(), grain_size},
203-
{r.dim(1).begin(), r.dim(1).end()}};
203+
return {{r.dim(0).begin(), r.dim(0).end(), grain_sizes[0]},
204+
{r.dim(1).begin(), r.dim(1).end(), grain_sizes[1]}};
204205
}
205206

206207
inline tbb::blocked_rangeNd<Int32,3>
207-
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32,3>& r,std::size_t grain_size)
208+
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32,3>& r,FixedArray<size_t,3> grain_sizes)
208209
{
209-
return {{r.dim(0).begin(), r.dim(0).end(), grain_size},
210-
{r.dim(1).begin(), r.dim(0).end()},
211-
{r.dim(2).begin(), r.dim(0).end()}};
210+
return {{r.dim(0).begin(), r.dim(0).end(), grain_sizes[0]},
211+
{r.dim(1).begin(), r.dim(1).end(), grain_sizes[1]},
212+
{r.dim(2).begin(), r.dim(2).end(), grain_sizes[2]}};
212213
}
213214

214215
inline tbb::blocked_rangeNd<Int32,4>
215-
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32,4>& r,std::size_t grain_size)
216+
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32,4>& r,FixedArray<size_t,4> grain_sizes)
216217
{
217-
return {{r.dim(0).begin(), r.dim(0).end(), grain_size},
218-
{r.dim(1).begin(), r.dim(1).end()},
219-
{r.dim(2).begin(), r.dim(2).end()},
220-
{r.dim(3).begin(), r.dim(3).end()}};
218+
return {{r.dim(0).begin(), r.dim(0).end(), grain_sizes[0]},
219+
{r.dim(1).begin(), r.dim(1).end(), grain_sizes[1]},
220+
{r.dim(2).begin(), r.dim(2).end(), grain_sizes[2]},
221+
{r.dim(3).begin(), r.dim(3).end(), grain_sizes[3]}};
221222
}
222223

223224
/*---------------------------------------------------------------------------*/
@@ -766,8 +767,13 @@ class TBBMDParallelFor
766767
o << "TBB: INDEX=" << TaskFactory::currentTaskThreadIndex()
767768
<< " id=" << std::this_thread::get_id()
768769
<< " max_allowed=" << m_nb_allowed_thread
769-
//<< " range_begin=" << range.begin() << " range_size=" << range.size()
770-
<< "\n";
770+
<< " MDFor ";
771+
for( Int32 i=0; i<RankValue; ++i ){
772+
Int32 r0= range.dim(i).begin();
773+
Int32 r1 = range.dim(i).size();
774+
o << " range" << i << " (begin=" << r0 << " size=" << r1 << ")";
775+
}
776+
o << "\n";
771777
std::cout << o.str();
772778
std::cout.flush();
773779
}
@@ -931,7 +937,9 @@ class TBBTaskImplementation::ParallelForExecute
931937
std::cout << "TBB: TBBTaskImplementationInit ParallelForExecute begin=" << m_begin
932938
<< " size=" << m_size << " gsize=" << gsize
933939
<< " partitioner=" << (int)m_options.partitioner()
934-
<< " nb_thread=" << nb_thread << '\n';
940+
<< " nb_thread=" << nb_thread
941+
<< " has_stat_info=" << (m_stat_info!=nullptr)
942+
<< '\n';
935943

936944
if (gsize>0)
937945
range = tbb::blocked_range<Integer>(m_begin,m_begin+m_size,gsize);
@@ -968,19 +976,54 @@ class TBBTaskImplementation::MDParallelForExecute
968976
const ParallelLoopOptions& options,
969977
const ComplexForLoopRanges<RankValue>& range,
970978
IMDRangeFunctor<RankValue>* f,[[maybe_unused]] ForLoopOneExecStat* stat_info)
971-
: m_impl(impl), m_tbb_range(_toTBBRange(range)), m_functor(f), m_options(options)
979+
: m_impl(impl)
980+
, m_tbb_range(_toTBBRange(range))
981+
, m_functor(f)
982+
, m_options(options)
983+
, m_stat_info(stat_info)
972984
{
973985
// On ne peut pas modifier les valeurs d'une instance de tbb::blocked_rangeNd.
974986
// Il faut donc en reconstruire une complètement.
975-
976-
Integer gsize = m_options.grainSize();
977-
if (gsize>0 && RankValue==1){
978-
Int32 max_range0 = range.template upperBound<0>() - range.template lowerBound<0>();
979-
if (gsize > max_range0)
980-
gsize = max_range0;
981-
// Modifie la taille du grain pour la première dimension.
982-
// TODO: pouvoir aussi modifier la valeur de 'grain_size' pour les autres dimensions.
983-
m_tbb_range = _toTBBRangeWithGrain(m_tbb_range,gsize);
987+
FixedArray<size_t,RankValue> all_grain_sizes;
988+
Int32 gsize = m_options.grainSize();
989+
if (gsize>0){
990+
// Si la taille du grain est différent zéro, il faut la répartir
991+
// sur l'ensemble des dimensions. On commence par la dernière.
992+
// TODO: regarder pourquoi dans certains cas les performances sont
993+
// inférieures à celles qu'on obtient en utilisant un partitionneur
994+
// statique.
995+
constexpr bool is_verbose = false;
996+
std::array<Int32,RankValue> range_extents = range.extents().asStdArray();
997+
double ratio = static_cast<double>(gsize) / static_cast<double>(range.nbElement());
998+
if constexpr (is_verbose){
999+
std::cout << "GSIZE=" << gsize << " rank=" << RankValue << " ratio=" << ratio;
1000+
for(Int32 i=0; i<RankValue; ++i )
1001+
std::cout << " range" << i << "=" << range_extents[i];
1002+
std::cout << "\n";
1003+
}
1004+
Int32 index = RankValue - 1;
1005+
Int32 remaining_grain = gsize;
1006+
for( ; index>=0; --index ){
1007+
Int32 current = range_extents[index];
1008+
if constexpr (is_verbose)
1009+
std::cout << "Check index=" << index << " remaining=" << remaining_grain << " current=" << current << "\n";
1010+
if (remaining_grain>current){
1011+
all_grain_sizes[index] = current;
1012+
remaining_grain /= current;
1013+
}
1014+
else{
1015+
all_grain_sizes[index] = remaining_grain;
1016+
break;
1017+
}
1018+
}
1019+
for( Int32 i=0; i<index; ++i )
1020+
all_grain_sizes[i] = 1;
1021+
if constexpr (is_verbose){
1022+
for(Int32 i=0; i<RankValue; ++i )
1023+
std::cout << " grain" << i << "=" << all_grain_sizes[i];
1024+
std::cout << "\n";
1025+
}
1026+
m_tbb_range = _toTBBRangeWithGrain(m_tbb_range,all_grain_sizes);
9841027
}
9851028
}
9861029

@@ -1001,8 +1044,9 @@ class TBBTaskImplementation::MDParallelForExecute
10011044
//TBBDeterministicParallelFor dpf(m_impl,pf,m_begin,m_size,gsize,nb_thread);
10021045
//tbb::parallel_for(range2,dpf);
10031046
}
1004-
else
1047+
else{
10051048
tbb::parallel_for(m_tbb_range,pf);
1049+
}
10061050
}
10071051
private:
10081052
TBBTaskImplementation* m_impl = nullptr;
@@ -1152,10 +1196,14 @@ _executeMDParallelFor(const ComplexForLoopRanges<RankValue>& loop_ranges,
11521196
ForLoopOneExecStat* stat_info = sei.statInfo();
11531197
impl::ScopedStatLoop scoped_loop(sei.isOwn() ? stat_info : nullptr);
11541198

1155-
if (TaskFactory::verboseLevel()>=1)
1199+
if (TaskFactory::verboseLevel()>=1){
11561200
std::cout << "TBB: TBBTaskImplementation executeMDParallelFor nb_dim=" << RankValue
11571201
<< " nb_element=" << loop_ranges.nbElement()
1158-
<< " grain_size=" << options.grainSize() << '\n';
1202+
<< " grain_size=" << options.grainSize()
1203+
<< " name=" << run_info.traceInfo().traceInfo()
1204+
<< " has_stat_info=" << (stat_info!=nullptr)
1205+
<< '\n';
1206+
}
11591207

11601208
Integer max_thread = options.maxThread();
11611209
// En exécution séquentielle, appelle directement la méthode \a f.

arcane/src/arcane/utils/ForLoopRanges.h

+5-2
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
11
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
22
//-----------------------------------------------------------------------------
3-
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
3+
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
44
// See the top-level COPYRIGHT file for details.
55
// SPDX-License-Identifier: Apache-2.0
66
//-----------------------------------------------------------------------------
77
/*---------------------------------------------------------------------------*/
8-
/* ForLoopRanges.h (C) 2000-2024 */
8+
/* ForLoopRanges.h (C) 2000-2025 */
99
/* */
1010
/* Intervalles d'itérations pour les boucles. */
1111
/*---------------------------------------------------------------------------*/
@@ -104,6 +104,7 @@ class SimpleForLoopRanges
104104

105105
template <Int32 I> constexpr Int32 lowerBound() const { return 0; }
106106
template <Int32 I> constexpr Int32 upperBound() const { return m_bounds.template constExtent<I>(); }
107+
template <Int32 I> constexpr Int32 extent() const { return m_bounds.template constExtent<I>(); }
107108
constexpr Int64 nbElement() const { return m_bounds.nbElement(); }
108109
constexpr ArrayIndexType getIndices(Int32 i) const { return m_bounds.getIndices(i); }
109110

@@ -144,13 +145,15 @@ class ComplexForLoopRanges
144145

145146
template <Int32 I> constexpr Int32 lowerBound() const { return m_lower_bounds[I]; }
146147
template <Int32 I> constexpr Int32 upperBound() const { return m_lower_bounds[I] + m_extents.template constExtent<I>(); }
148+
template <Int32 I> constexpr Int32 extent() const { return m_extents.template constExtent<I>(); }
147149
constexpr Int64 nbElement() const { return m_extents.nbElement(); }
148150
constexpr ArrayIndexType getIndices(Int32 i) const
149151
{
150152
auto x = m_extents.getIndices(i);
151153
x.add(m_lower_bounds);
152154
return x;
153155
}
156+
constexpr ArrayBoundsType extents() const { return m_extents; }
154157

155158
private:
156159

0 commit comments

Comments
 (0)