Merge pull request #1880 from arcaneframework/dev/gg-add-multi-thread-filterer

grospelliergilles · web-flow · commit 53c165a2e7be · 2024-12-29T22:36:06.000+01:00
Add basic multi-thread implementation for 'GenericFiltererIf'
diff --git a/arcane/src/arcane/accelerator/GenericFilterer.h b/arcane/src/arcane/accelerator/GenericFilterer.h
@@ -26,6 +26,7 @@
 #include "arcane/accelerator/RunCommandLaunchInfo.h"
 #include "arcane/accelerator/RunCommandLoop.h"
 #include "arcane/accelerator/ScanImpl.h"
+#include "arcane/accelerator/MultiThreadAlgo.h"
 
 /*---------------------------------------------------------------------------*/
 /*---------------------------------------------------------------------------*/
@@ -315,7 +316,12 @@ class GenericFilteringIf
     } break;
 #endif
     case eExecutionPolicy::Thread:
-      // Pas encore implémenté en multi-thread
+      if (nb_item > 500) {
+        MultiThreadAlgo scanner;
+        Int32 v = scanner.doFilter(launch_info.loopRunInfo(), nb_item, input_iter, output_iter, select_lambda);
+        s.m_host_nb_out_storage[0] = v;
+        break;
+      }
       [[fallthrough]];
     case eExecutionPolicy::Sequential: {
       Int32 index = 0;
diff --git a/arcane/src/arcane/accelerator/MultiThreadAlgo.h b/arcane/src/arcane/accelerator/MultiThreadAlgo.h
@@ -130,6 +130,102 @@ class MultiThreadAlgo
     Arcane::arcaneParallelFor(0, nb_block, run_info, final_sum_func);
   }
 
+  template <typename InputIterator, typename OutputIterator, typename SelectLambda>
+  Int32 doFilter(ForLoopRunInfo run_info, Int32 nb_value, // run_info: options forwarded to arcaneParallelFor; nb_value: number of input elements
+                 InputIterator input, OutputIterator output, // input is read as input[i]; kept elements are written as output[k]
+                 SelectLambda select_lambda) // predicate called on input[i]; its result is stored as a bool
+  { // Copies each input[i] accepted by select_lambda to output and returns the number of elements kept.
+    // Index type (used for the per-block counts and for positions in the output)
+    using IndexType = Int32;
+
+    UniqueArray<bool> select_flags(nb_value); // one flag per input element, filled by the first pass
+    Span<bool> select_flags_view = select_flags; // NOTE(review): assumes Span is a non-owning view, so lambda copies share the flags — confirm
+    //std::cout << "DO_FILTER MULTI_THREAD nb_value=" << nb_value << "\n";
+    auto multiple_getter_func = [=](Int32 input_index, Int32 nb_value) -> IndexType { // pass 1 on one range; this nb_value parameter shadows the outer one
+      IndexType partial_value = 0;
+      for (Int32 x = 0; x < nb_value; ++x) {
+        const Int32 index = x + input_index;
+        bool is_select = select_lambda(input[index]);
+        select_flags_view[index] = is_select; // remember the decision so the predicate is not called again in pass 2
+        if (is_select)
+          ++partial_value;
+      }
+      return partial_value; // number of elements selected in [input_index, input_index + nb_value)
+    };
+
+    auto multiple_setter_func = [=](IndexType partial_value, Int32 input_index, Int32 nb_value) { // pass 2 on one range; partial_value is the first output position to write
+      for (Int32 x = 0; x < nb_value; ++x) {
+        const Int32 index = x + input_index;
+        if (select_flags_view[index]) {
+          output[partial_value] = input[index];
+          ++partial_value;
+        }
+      }
+    };
+
+    // TODO: compute this value automatically.
+    const Int32 nb_block = 10;
+
+    // Array holding the partial values of the blocks (one entry per block).
+    // TODO: Use padding to avoid cache conflicts between threads.
+    SmallArray<Int32> partial_values(nb_block, 0);
+    Span<Int32> out_partial_values = partial_values; // NOTE(review): assumed to alias partial_values, so later writes to partial_values are visible through it — confirm
+
+    auto partial_value_func = [=](Int32 a, Int32 n) { // loop body for blocks [a, a + n): stores the selected count of each block
+      for (Int32 i = 0; i < n; ++i) {
+        Int32 interval_index = i + a;
+
+        Int32 input_index = 0;
+        Int32 nb_value_in_interval = 0;
+        _subInterval<Int32>(nb_value, interval_index, nb_block, &input_index, &nb_value_in_interval); // presumably returns the start and size of block interval_index among nb_block — see _subInterval
+
+        out_partial_values[interval_index] = multiple_getter_func(input_index, nb_value_in_interval);
+      }
+    };
+
+    ParallelLoopOptions loop_options(run_info.options().value_or(ParallelLoopOptions{})); // start from the caller's options when present, defaults otherwise
+    loop_options.setGrainSize(1); // NOTE(review): presumably so that each block can be dispatched on its own — confirm
+    run_info.addOptions(loop_options);
+
+    // Compute the partial counts for the nb_block blocks
+    Arcane::arcaneParallelFor(0, nb_block, run_info, partial_value_func);
+
+    // Compute the number of filtered values
+    // Also replace partial_values with its running (inclusive) sum
+    Int32 nb_filter = 0;
+    for (Int32 i = 0; i < nb_block; ++i) {
+      Int32 x = partial_values[i];
+      nb_filter += x;
+      partial_values[i] = nb_filter;
+    }
+
+    auto filter_func = [=](Int32 a, Int32 n) { // loop body for blocks [a, a + n): writes the selected elements of each block to output
+      for (Int32 i = 0; i < n; ++i) {
+        Int32 interval_index = i + a;
+
+        IndexType partial_value = 0; // block 0 starts writing at output position 0
+        if (interval_index > 0)
+          partial_value = out_partial_values[interval_index - 1]; // cumulated count of the previous blocks = first output position of this block
+
+        Int32 input_index = 0;
+        Int32 nb_value_in_interval = 0;
+        _subInterval<Int32>(nb_value, interval_index, nb_block, &input_index, &nb_value_in_interval); // same call as in the counting pass, so both passes use the same ranges
+
+        multiple_setter_func(partial_value, input_index, nb_value_in_interval);
+      }
+    };
+
+    // For now the input and the output may overlap.
+    // In that case the fill is done sequentially. The flag below is always true, so the parallel branch is currently never taken.
+    const bool may_input_and_output_overlap = true;
+    if (may_input_and_output_overlap)
+      filter_func(0, nb_block);
+    else
+      Arcane::arcaneParallelFor(0, nb_block, run_info, filter_func);
+
+    return nb_filter; // total number of elements written to output
+  }
+
  private:
 
   template <typename SizeType>