
Commit 846f827

Add basic parallel_for support to reduce_util
Initial parallel_for integration in a portable op. Needed for #8932. Feel free to hold review until the rest of the stack is ready and we observe successful parallelization.

ghstack-source-id: 3d510f0abf35069c3c3939605ff9c5639f8f845d
ghstack-comment-id: 2702502530
Pull Request resolved: #8986
1 parent b9b44b9 commit 846f827

File tree

3 files changed: +32 / -8 lines changed


kernels/portable/cpu/util/reduce_util.h (+14, -6)

@@ -8,8 +8,10 @@
 
 #pragma once
 
+#include <c10/util/irange.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <cstring>
 #include <tuple>
 
@@ -24,9 +26,12 @@ void apply_on_flat_ix_with_stride_and_base(
     const size_t base,
     const size_t start,
     const size_t end) {
-  for (size_t i = start; i <= end; i++) {
-    fn(base + i * stride);
-  }
+  executorch::extension::parallel_for(
+      start, end + 1, [&](auto start_, auto end_) {
+        for (const auto i : c10::irange(start_, end_)) {
+          fn(base + i * stride);
+        }
+      });
 }
 
 template <typename Fn>
@@ -36,9 +41,12 @@ void apply_on_flat_and_dim_ix_with_stride_and_base(
     const size_t base,
     const size_t start,
     const size_t end) {
-  for (size_t i = start; i <= end; i++) {
-    fn(base + i * stride, i);
-  }
+  executorch::extension::parallel_for(
+      start, end + 1, [&](auto start_, auto end_) {
+        for (const auto i : c10::irange(start_, end_)) {
+          fn(base + i * stride, i);
+        }
+      });
 }
 
 template <typename Fn>
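
Note that parallel_for takes a half-open range, so the old inclusive loop over [start, end] becomes a call over [start, end + 1), and the lambda receives the sub-range it should process. Below is a minimal standalone sketch of the same pattern, assuming the ExecuTorch headers are on the include path; the buffer, element type, and scale_strided name are illustrative and not part of this change.

#include <c10/util/irange.h>
#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <vector>

// Doubles every strided element, split across worker threads when a
// threadpool is available. Each chunk [begin_, end_) is disjoint, so the
// writes do not race as long as stride is nonzero.
void scale_strided(
    std::vector<float>& data,
    const size_t stride,
    const size_t base,
    const size_t start,
    const size_t end) {
  executorch::extension::parallel_for(
      start, end + 1, [&](const auto begin_, const auto end_) {
        for (const auto i : c10::irange(begin_, end_)) {
          data[base + i * stride] *= 2.0f;
        }
      });
}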

kernels/portable/cpu/util/targets.bzl (+5, -1)

@@ -299,8 +299,12 @@ def define_common_targets():
         srcs = ["reduce_util.cpp"],
         exported_headers = ["reduce_util.h"],
         deps = [
-            "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
             "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
+            "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
+        ],
+        exported_deps = [
+            "//executorch/runtime/kernel:thread_parallel_interface",
+            "//executorch/runtime/core/portable_type/c10/c10:c10",
         ],
         exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
         visibility = [
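
Because reduce_util.h now includes thread_parallel_interface.h and c10/util/irange.h directly, those targets are listed under exported_deps rather than deps, so anything that depends on the header picks them up transitively. The shuffling of kernel_includes and tensor_util appears to be just an alphabetical re-sort of the existing deps list.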

runtime/kernel/thread_parallel_interface.h (+13, -1)

@@ -33,6 +33,10 @@ inline bool parallel_for_no_threadpool(
   return true;
 }
 
+// Match GRAIN_SIZE from PyTorch core.
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78
+constexpr int64_t GRAIN_SIZE = 32768;
+
 } // namespace internal
 
 #ifdef ET_USE_THREADPOOL
@@ -74,10 +78,18 @@ inline int64_t get_thread_num() {
   return 0;
 }
 
-void set_thread_num(int64_t thread_num) {
+inline void set_thread_num(int64_t thread_num) {
   ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!");
 }
 #endif // ET_USE_THREADPOOL
+
+/**
+ * Convenience version of parallel_for that sets the grain size to internal::GRAIN_SIZE.
+ */
+template <typename Func>
+bool parallel_for(const int64_t begin, const int64_t end, const Func& func) {
+  return parallel_for(begin, end, internal::GRAIN_SIZE, func);
+}
 } // namespace extension
 } // namespace executorch
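
A short usage sketch of the new convenience overload follows; only parallel_for, GRAIN_SIZE, and the header come from this change, while count_even and its inputs are illustrative. With ET_USE_THREADPOOL the range is split into chunks of roughly the grain size; without it, the callback is simply invoked sequentially over the whole range, so the code behaves the same either way.

#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <atomic>
#include <cstdint>

// Counts even values with per-chunk local sums merged through an atomic,
// so concurrent chunks never write to shared state directly.
int64_t count_even(const int64_t* values, const int64_t n) {
  std::atomic<int64_t> total{0};
  executorch::extension::parallel_for(
      0, n, [&](const int64_t begin, const int64_t end) {
        int64_t local = 0;
        for (int64_t i = begin; i < end; ++i) {
          local += (values[i] % 2 == 0) ? 1 : 0;
        }
        total.fetch_add(local, std::memory_order_relaxed);
      });
  return total.load();
}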
