Skip to content

Commit fbed0b2

Browse files
authored
Skip grain_size computation in parallel_for_each_reduce_* wrappers without threadpool (#9144)
If we don't have the threadpool then this division is wasted.
1 parent 2d761f0 commit fbed0b2

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

kernels/portable/cpu/util/reduce_util.h

+8
Original file line numberDiff line numberDiff line change
@@ -823,11 +823,15 @@ template <typename Func>
823823
executorch::aten::optional<int64_t> dim,
824824
const Tensor& out,
825825
const Func& func) {
826+
#ifdef ET_USE_THREADPOOL
826827
const ssize_t reduction_size = get_reduced_dim_product(in, dim);
827828
const auto grain_size = std::max(
828829
static_cast<ssize_t>(1),
829830
static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
830831
reduction_size);
832+
#else // ET_USE_THREADPOOL
833+
const auto grain_size = 1;
834+
#endif // ET_USE_THREADPOOL
831835
return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
832836
}
833837

@@ -842,11 +846,15 @@ template <typename Func>
842846
optional<ArrayRef<int64_t>> dim_list,
843847
const Tensor& out,
844848
const Func& func) {
849+
#ifdef ET_USE_THREADPOOL
845850
const ssize_t reduction_size = get_reduced_dim_product(in, dim_list);
846851
const auto grain_size = std::max(
847852
static_cast<ssize_t>(1),
848853
static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
849854
reduction_size);
855+
#else // ET_USE_THREADPOOL
856+
const auto grain_size = 1;
857+
#endif // ET_USE_THREADPOOL
850858
return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
851859
}
852860

0 commit comments

Comments
 (0)