21
21
#include " arcane/utils/PlatformUtils.h"
22
22
#include " arcane/utils/Profiling.h"
23
23
#include " arcane/utils/MemoryAllocator.h"
24
+ #include " arcane/utils/FixedArray.h"
24
25
#include " arcane/utils/internal/TaskFactoryInternal.h"
25
26
26
27
#include " arcane/core/FactoryService.h"
@@ -197,27 +198,27 @@ _toTBBRange(const ComplexForLoopRanges<4>& r)
197
198
/* ---------------------------------------------------------------------------*/
198
199
199
200
/*!
 * \brief Rebuild a 2D \a tbb::blocked_rangeNd identical to \a r but with an
 * explicit grain size for each dimension.
 *
 * \param r source range (bounds are copied, its grain sizes are ignored).
 * \param grain_sizes per-dimension grain sizes (index 0 = first dimension).
 * \return a new range with the same bounds as \a r and the requested grains.
 */
inline tbb::blocked_rangeNd<Int32, 2>
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32, 2>& r, FixedArray<size_t, 2> grain_sizes)
{
  // blocked_rangeNd is immutable, so a new instance has to be built
  // dimension by dimension with the wanted grain sizes.
  return { { r.dim(0).begin(), r.dim(0).end(), grain_sizes[0] },
           { r.dim(1).begin(), r.dim(1).end(), grain_sizes[1] } };
}
205
206
206
207
/*!
 * \brief Rebuild a 3D \a tbb::blocked_rangeNd identical to \a r but with an
 * explicit grain size for each dimension.
 *
 * Fixes the previous implementation which wrongly reused the bounds of
 * dimension 0 (\a r.dim(0).end()) for dimensions 1 and 2.
 *
 * \param r source range (bounds are copied, its grain sizes are ignored).
 * \param grain_sizes per-dimension grain sizes (index 0 = first dimension).
 * \return a new range with the same bounds as \a r and the requested grains.
 */
inline tbb::blocked_rangeNd<Int32, 3>
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32, 3>& r, FixedArray<size_t, 3> grain_sizes)
{
  // blocked_rangeNd is immutable, so a new instance has to be built
  // dimension by dimension with the wanted grain sizes.
  return { { r.dim(0).begin(), r.dim(0).end(), grain_sizes[0] },
           { r.dim(1).begin(), r.dim(1).end(), grain_sizes[1] },
           { r.dim(2).begin(), r.dim(2).end(), grain_sizes[2] } };
}
213
214
214
215
/*!
 * \brief Rebuild a 4D \a tbb::blocked_rangeNd identical to \a r but with an
 * explicit grain size for each dimension.
 *
 * \param r source range (bounds are copied, its grain sizes are ignored).
 * \param grain_sizes per-dimension grain sizes (index 0 = first dimension).
 * \return a new range with the same bounds as \a r and the requested grains.
 */
inline tbb::blocked_rangeNd<Int32, 4>
_toTBBRangeWithGrain(const tbb::blocked_rangeNd<Int32, 4>& r, FixedArray<size_t, 4> grain_sizes)
{
  // blocked_rangeNd is immutable, so a new instance has to be built
  // dimension by dimension with the wanted grain sizes.
  return { { r.dim(0).begin(), r.dim(0).end(), grain_sizes[0] },
           { r.dim(1).begin(), r.dim(1).end(), grain_sizes[1] },
           { r.dim(2).begin(), r.dim(2).end(), grain_sizes[2] },
           { r.dim(3).begin(), r.dim(3).end(), grain_sizes[3] } };
}
222
223
223
224
/* ---------------------------------------------------------------------------*/
@@ -766,8 +767,13 @@ class TBBMDParallelFor
766
767
o << " TBB: INDEX=" << TaskFactory::currentTaskThreadIndex ()
767
768
<< " id=" << std::this_thread::get_id ()
768
769
<< " max_allowed=" << m_nb_allowed_thread
769
- // << " range_begin=" << range.begin() << " range_size=" << range.size()
770
- << " \n " ;
770
+ << " MDFor " ;
771
+ for ( Int32 i=0 ; i<RankValue; ++i ){
772
+ Int32 r0= range.dim (i).begin ();
773
+ Int32 r1 = range.dim (i).size ();
774
+ o << " range" << i << " (begin=" << r0 << " size=" << r1 << " )" ;
775
+ }
776
+ o << " \n " ;
771
777
std::cout << o.str ();
772
778
std::cout.flush ();
773
779
}
@@ -931,7 +937,9 @@ class TBBTaskImplementation::ParallelForExecute
931
937
std::cout << " TBB: TBBTaskImplementationInit ParallelForExecute begin=" << m_begin
932
938
<< " size=" << m_size << " gsize=" << gsize
933
939
<< " partitioner=" << (int )m_options.partitioner ()
934
- << " nb_thread=" << nb_thread << ' \n ' ;
940
+ << " nb_thread=" << nb_thread
941
+ << " has_stat_info=" << (m_stat_info!=nullptr )
942
+ << ' \n ' ;
935
943
936
944
if (gsize>0 )
937
945
range = tbb::blocked_range<Integer>(m_begin,m_begin+m_size,gsize);
@@ -968,19 +976,54 @@ class TBBTaskImplementation::MDParallelForExecute
968
976
const ParallelLoopOptions& options,
969
977
const ComplexForLoopRanges<RankValue>& range,
970
978
IMDRangeFunctor<RankValue>* f,[[maybe_unused]] ForLoopOneExecStat* stat_info)
971
- : m_impl(impl), m_tbb_range(_toTBBRange(range)), m_functor(f), m_options(options)
979
+ : m_impl(impl)
980
+ , m_tbb_range(_toTBBRange(range))
981
+ , m_functor(f)
982
+ , m_options(options)
983
+ , m_stat_info(stat_info)
972
984
{
973
985
// On ne peut pas modifier les valeurs d'une instance de tbb::blocked_rangeNd.
974
986
// Il faut donc en reconstruire une complètement.
975
-
976
- Integer gsize = m_options.grainSize ();
977
- if (gsize>0 && RankValue==1 ){
978
- Int32 max_range0 = range.template upperBound <0 >() - range.template lowerBound <0 >();
979
- if (gsize > max_range0)
980
- gsize = max_range0;
981
- // Modifie la taille du grain pour la première dimension.
982
- // TODO: pouvoir aussi modifier la valeur de 'grain_size' pour les autres dimensions.
983
- m_tbb_range = _toTBBRangeWithGrain (m_tbb_range,gsize);
987
+ FixedArray<size_t ,RankValue> all_grain_sizes;
988
+ Int32 gsize = m_options.grainSize ();
989
+ if (gsize>0 ){
990
+ // Si la taille du grain est différent zéro, il faut la répartir
991
+ // sur l'ensemble des dimensions. On commence par la dernière.
992
+ // TODO: regarder pourquoi dans certains cas les performances sont
993
+ // inférieures à celles qu'on obtient en utilisant un partitionneur
994
+ // statique.
995
+ constexpr bool is_verbose = false ;
996
+ std::array<Int32,RankValue> range_extents = range.extents ().asStdArray ();
997
+ double ratio = static_cast <double >(gsize) / static_cast <double >(range.nbElement ());
998
+ if constexpr (is_verbose){
999
+ std::cout << " GSIZE=" << gsize << " rank=" << RankValue << " ratio=" << ratio;
1000
+ for (Int32 i=0 ; i<RankValue; ++i )
1001
+ std::cout << " range" << i << " =" << range_extents[i];
1002
+ std::cout << " \n " ;
1003
+ }
1004
+ Int32 index = RankValue - 1 ;
1005
+ Int32 remaining_grain = gsize;
1006
+ for ( ; index >=0 ; --index ){
1007
+ Int32 current = range_extents[index ];
1008
+ if constexpr (is_verbose)
1009
+ std::cout << " Check index=" << index << " remaining=" << remaining_grain << " current=" << current << " \n " ;
1010
+ if (remaining_grain>current){
1011
+ all_grain_sizes[index ] = current;
1012
+ remaining_grain /= current;
1013
+ }
1014
+ else {
1015
+ all_grain_sizes[index ] = remaining_grain;
1016
+ break ;
1017
+ }
1018
+ }
1019
+ for ( Int32 i=0 ; i<index ; ++i )
1020
+ all_grain_sizes[i] = 1 ;
1021
+ if constexpr (is_verbose){
1022
+ for (Int32 i=0 ; i<RankValue; ++i )
1023
+ std::cout << " grain" << i << " =" << all_grain_sizes[i];
1024
+ std::cout << " \n " ;
1025
+ }
1026
+ m_tbb_range = _toTBBRangeWithGrain (m_tbb_range,all_grain_sizes);
984
1027
}
985
1028
}
986
1029
@@ -1001,8 +1044,9 @@ class TBBTaskImplementation::MDParallelForExecute
1001
1044
// TBBDeterministicParallelFor dpf(m_impl,pf,m_begin,m_size,gsize,nb_thread);
1002
1045
// tbb::parallel_for(range2,dpf);
1003
1046
}
1004
- else
1047
+ else {
1005
1048
tbb::parallel_for (m_tbb_range,pf);
1049
+ }
1006
1050
}
1007
1051
private:
1008
1052
TBBTaskImplementation* m_impl = nullptr ;
@@ -1152,10 +1196,14 @@ _executeMDParallelFor(const ComplexForLoopRanges<RankValue>& loop_ranges,
1152
1196
ForLoopOneExecStat* stat_info = sei.statInfo ();
1153
1197
impl::ScopedStatLoop scoped_loop (sei.isOwn () ? stat_info : nullptr );
1154
1198
1155
- if (TaskFactory::verboseLevel ()>=1 )
1199
+ if (TaskFactory::verboseLevel ()>=1 ){
1156
1200
std::cout << " TBB: TBBTaskImplementation executeMDParallelFor nb_dim=" << RankValue
1157
1201
<< " nb_element=" << loop_ranges.nbElement ()
1158
- << " grain_size=" << options.grainSize () << ' \n ' ;
1202
+ << " grain_size=" << options.grainSize ()
1203
+ << " name=" << run_info.traceInfo ().traceInfo ()
1204
+ << " has_stat_info=" << (stat_info!=nullptr )
1205
+ << ' \n ' ;
1206
+ }
1159
1207
1160
1208
Integer max_thread = options.maxThread ();
1161
1209
// En exécution séquentielle, appelle directement la méthode \a f.
0 commit comments