2121#include " arcane/utils/PlatformUtils.h"
2222#include " arcane/utils/Profiling.h"
2323#include " arcane/utils/MemoryAllocator.h"
24+ #include " arcane/utils/FixedArray.h"
2425#include " arcane/utils/internal/TaskFactoryInternal.h"
2526
2627#include " arcane/core/FactoryService.h"
@@ -197,27 +198,27 @@ _toTBBRange(const ComplexForLoopRanges<4>& r)
197198/* ---------------------------------------------------------------------------*/
198199
199200inline tbb::blocked_rangeNd<Int32,2 >
200- _toTBBRangeWithGrain (const tbb::blocked_rangeNd<Int32,2 >& r,std:: size_t grain_size )
201+ _toTBBRangeWithGrain (const tbb::blocked_rangeNd<Int32,2 >& r,FixedArray< size_t , 2 > grain_sizes )
201202{
202- return {{r.dim (0 ).begin (), r.dim (0 ).end (), grain_size },
203- {r.dim (1 ).begin (), r.dim (1 ).end ()}};
203+ return {{r.dim (0 ).begin (), r.dim (0 ).end (), grain_sizes[ 0 ] },
204+ {r.dim (1 ).begin (), r.dim (1 ).end (), grain_sizes[ 1 ] }};
204205}
205206
206207inline tbb::blocked_rangeNd<Int32,3 >
207- _toTBBRangeWithGrain (const tbb::blocked_rangeNd<Int32,3 >& r,std:: size_t grain_size )
208+ _toTBBRangeWithGrain (const tbb::blocked_rangeNd<Int32,3 >& r,FixedArray< size_t , 3 > grain_sizes )
208209{
209- return {{r.dim (0 ).begin (), r.dim (0 ).end (), grain_size },
210- {r.dim (1 ).begin (), r.dim (0 ).end ()},
211- {r.dim (2 ).begin (), r.dim (0 ).end ()}};
210+ return {{r.dim (0 ).begin (), r.dim (0 ).end (), grain_sizes[ 0 ] },
211+ {r.dim (1 ).begin (), r.dim (1 ).end (), grain_sizes[ 1 ] },
212+ {r.dim (2 ).begin (), r.dim (2 ).end (), grain_sizes[ 2 ] }};
212213}
213214
214215inline tbb::blocked_rangeNd<Int32,4 >
215- _toTBBRangeWithGrain (const tbb::blocked_rangeNd<Int32,4 >& r,std:: size_t grain_size )
216+ _toTBBRangeWithGrain (const tbb::blocked_rangeNd<Int32,4 >& r,FixedArray< size_t , 4 > grain_sizes )
216217{
217- return {{r.dim (0 ).begin (), r.dim (0 ).end (), grain_size },
218- {r.dim (1 ).begin (), r.dim (1 ).end ()},
219- {r.dim (2 ).begin (), r.dim (2 ).end ()},
220- {r.dim (3 ).begin (), r.dim (3 ).end ()}};
218+ return {{r.dim (0 ).begin (), r.dim (0 ).end (), grain_sizes[ 0 ] },
219+ {r.dim (1 ).begin (), r.dim (1 ).end (), grain_sizes[ 1 ] },
220+ {r.dim (2 ).begin (), r.dim (2 ).end (), grain_sizes[ 2 ] },
221+ {r.dim (3 ).begin (), r.dim (3 ).end (), grain_sizes[ 3 ] }};
221222}
222223
223224/* ---------------------------------------------------------------------------*/
@@ -766,8 +767,13 @@ class TBBMDParallelFor
766767 o << " TBB: INDEX=" << TaskFactory::currentTaskThreadIndex ()
767768 << " id=" << std::this_thread::get_id ()
768769 << " max_allowed=" << m_nb_allowed_thread
769- // << " range_begin=" << range.begin() << " range_size=" << range.size()
770- << " \n " ;
770+ << " MDFor " ;
771+ for ( Int32 i=0 ; i<RankValue; ++i ){
772+ Int32 r0= range.dim (i).begin ();
773+ Int32 r1 = range.dim (i).size ();
774+ o << " range" << i << " (begin=" << r0 << " size=" << r1 << " )" ;
775+ }
776+ o << " \n " ;
771777 std::cout << o.str ();
772778 std::cout.flush ();
773779 }
@@ -931,7 +937,9 @@ class TBBTaskImplementation::ParallelForExecute
931937 std::cout << " TBB: TBBTaskImplementationInit ParallelForExecute begin=" << m_begin
932938 << " size=" << m_size << " gsize=" << gsize
933939 << " partitioner=" << (int )m_options.partitioner ()
934- << " nb_thread=" << nb_thread << ' \n ' ;
940+ << " nb_thread=" << nb_thread
941+ << " has_stat_info=" << (m_stat_info!=nullptr )
942+ << ' \n ' ;
935943
936944 if (gsize>0 )
937945 range = tbb::blocked_range<Integer>(m_begin,m_begin+m_size,gsize);
@@ -968,19 +976,54 @@ class TBBTaskImplementation::MDParallelForExecute
968976 const ParallelLoopOptions& options,
969977 const ComplexForLoopRanges<RankValue>& range,
970978 IMDRangeFunctor<RankValue>* f,[[maybe_unused]] ForLoopOneExecStat* stat_info)
971- : m_impl(impl), m_tbb_range(_toTBBRange(range)), m_functor(f), m_options(options)
979+ : m_impl(impl)
980+ , m_tbb_range(_toTBBRange(range))
981+ , m_functor(f)
982+ , m_options(options)
983+ , m_stat_info(stat_info)
972984 {
973985 // On ne peut pas modifier les valeurs d'une instance de tbb::blocked_rangeNd.
974986 // Il faut donc en reconstruire une complètement.
975-
976- Integer gsize = m_options.grainSize ();
977- if (gsize>0 && RankValue==1 ){
978- Int32 max_range0 = range.template upperBound <0 >() - range.template lowerBound <0 >();
979- if (gsize > max_range0)
980- gsize = max_range0;
981- // Modifie la taille du grain pour la première dimension.
982- // TODO: pouvoir aussi modifier la valeur de 'grain_size' pour les autres dimensions.
983- m_tbb_range = _toTBBRangeWithGrain (m_tbb_range,gsize);
987+ FixedArray<size_t ,RankValue> all_grain_sizes;
988+ Int32 gsize = m_options.grainSize ();
989+ if (gsize>0 ){
990+ // Si la taille du grain est différent zéro, il faut la répartir
991+ // sur l'ensemble des dimensions. On commence par la dernière.
992+ // TODO: regarder pourquoi dans certains cas les performances sont
993+ // inférieures à celles qu'on obtient en utilisant un partitionneur
994+ // statique.
995+ constexpr bool is_verbose = false ;
996+ std::array<Int32,RankValue> range_extents = range.extents ().asStdArray ();
997+ double ratio = static_cast <double >(gsize) / static_cast <double >(range.nbElement ());
998+ if constexpr (is_verbose){
999+ std::cout << " GSIZE=" << gsize << " rank=" << RankValue << " ratio=" << ratio;
1000+ for (Int32 i=0 ; i<RankValue; ++i )
1001+ std::cout << " range" << i << " =" << range_extents[i];
1002+ std::cout << " \n " ;
1003+ }
1004+ Int32 index = RankValue - 1 ;
1005+ Int32 remaining_grain = gsize;
1006+ for ( ; index>=0 ; --index ){
1007+ Int32 current = range_extents[index];
1008+ if constexpr (is_verbose)
1009+ std::cout << " Check index=" << index << " remaining=" << remaining_grain << " current=" << current << " \n " ;
1010+ if (remaining_grain>current){
1011+ all_grain_sizes[index] = current;
1012+ remaining_grain /= current;
1013+ }
1014+ else {
1015+ all_grain_sizes[index] = remaining_grain;
1016+ break ;
1017+ }
1018+ }
1019+ for ( Int32 i=0 ; i<index; ++i )
1020+ all_grain_sizes[i] = 1 ;
1021+ if constexpr (is_verbose){
1022+ for (Int32 i=0 ; i<RankValue; ++i )
1023+ std::cout << " grain" << i << " =" << all_grain_sizes[i];
1024+ std::cout << " \n " ;
1025+ }
1026+ m_tbb_range = _toTBBRangeWithGrain (m_tbb_range,all_grain_sizes);
9841027 }
9851028 }
9861029
@@ -1001,8 +1044,9 @@ class TBBTaskImplementation::MDParallelForExecute
10011044 // TBBDeterministicParallelFor dpf(m_impl,pf,m_begin,m_size,gsize,nb_thread);
10021045 // tbb::parallel_for(range2,dpf);
10031046 }
1004- else
1047+ else {
10051048 tbb::parallel_for (m_tbb_range,pf);
1049+ }
10061050 }
10071051 private:
10081052 TBBTaskImplementation* m_impl = nullptr ;
@@ -1152,10 +1196,14 @@ _executeMDParallelFor(const ComplexForLoopRanges<RankValue>& loop_ranges,
11521196 ForLoopOneExecStat* stat_info = sei.statInfo ();
11531197 impl::ScopedStatLoop scoped_loop (sei.isOwn () ? stat_info : nullptr );
11541198
1155- if (TaskFactory::verboseLevel ()>=1 )
1199+ if (TaskFactory::verboseLevel ()>=1 ){
11561200 std::cout << " TBB: TBBTaskImplementation executeMDParallelFor nb_dim=" << RankValue
11571201 << " nb_element=" << loop_ranges.nbElement ()
1158- << " grain_size=" << options.grainSize () << ' \n ' ;
1202+ << " grain_size=" << options.grainSize ()
1203+ << " name=" << run_info.traceInfo ().traceInfo ()
1204+ << " has_stat_info=" << (stat_info!=nullptr )
1205+ << ' \n ' ;
1206+ }
11591207
11601208 Integer max_thread = options.maxThread ();
11611209 // En exécution séquentielle, appelle directement la méthode \a f.
0 commit comments