
Commit 49288c3

Damian Reeves authored and facebook-github-bot committed
Narrow scaleup probes to max cache sharding span (#2588)
Summary:
Pull Request resolved: #2588

When a large amount of cache scaleup budget is available, significantly more than the total memory needed to promote every table to HBM, many of the budget probes will attempt to cost a plan using more budget than the proposal can utilize. In these scenarios we tend to see only two distinct plan costs: 1) the min-working-set plan, which is costed first, and 2) every other proposal, which "clips" at the max scaleup limit (i.e. everything promoted to HBM). It is also plausible that the fully-promoted plan is more expensive than the min-working-set plan, due to the increased bin-packing difficulty of fitting the larger shards. In these cases the job runs only on the min-working-set proposal, even though lots of additional memory is available for larger caches (up to the point of diminishing returns from bin-packing overhead).

This diff narrows the search region when more memory is available than we can use, focusing the search effort on productive portions of the search space. This increases the likelihood that we discover a plan cheaper than both the min-working-set and fully-promoted plans.

Reviewed By: keyan

Differential Revision: D66419942

fbshipit-source-id: 8d5ad8b70179517193fa88e9acc041ffb171b822
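A minimal sketch of the narrowing described above, using the figures from the updated test comment; the GB constant and variable names here are illustrative, not the planner's real API:

    # Sketch: clamp the scale-up search span to what the plan can actually use.
    GB = 1024**3

    hbm_used_previously = 7.47 * GB  # min-working-set plan
    hbm_available = 92.53 * GB       # total scale-up budget from the topology
    peak_budget_need = 67.06 * GB    # get_hbm_ceiling(plan) - hbm_used_previously

    # Before this diff, probes spanned [0, hbm_available]; every probe beyond
    # peak_budget_need clipped to the fully-promoted plan, wasting iterations.
    search_budget = min(hbm_available, peak_budget_need)

    low = hbm_used_previously
    high = hbm_used_previously + search_budget
    print(f"exploring plans of size [{low / GB:.2f}, {high / GB:.2f}] GB")
    # -> exploring plans of size [7.47, 74.53] GB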
1 parent caa6773 commit 49288c3

File tree

2 files changed: +96, -17 lines


torchrec/distributed/planner/proposers.py

Lines changed: 66 additions & 15 deletions
@@ -12,7 +12,7 @@
 import logging
 from collections import OrderedDict
 from decimal import Decimal
-from typing import cast, Dict, List, Optional, Set, Tuple, Union
+from typing import cast, Dict, List, Optional, Set, Tuple, TypeVar, Union
 
 import torch
 
@@ -460,6 +460,15 @@ def feedback(
         self._current_proposal = -1
 
 
+_T = TypeVar("_T")
+
+
+def _none_throws(x: Optional[_T]) -> _T:
+    if x is None:
+        raise AssertionError("unexpected None")
+    return x
+
+
 class EmbeddingOffloadScaleupProposer(Proposer):
     def __init__(self, use_depth: bool = True) -> None:
         self.use_depth: bool = use_depth
@@ -535,6 +544,26 @@ def load(
         )
         self.proposal = copy.deepcopy(self.starting_proposal)
 
+    @staticmethod
+    def get_hbm_ceiling(
+        starting_proposal: List[ShardingOption], enumerator: Enumerator
+    ) -> int:
+        """returns total amount of memory scaleup could use."""
+        proposal = copy.deepcopy(starting_proposal)
+        cache_tables = EmbeddingOffloadScaleupProposer.get_scalable_sharding_options(
+            proposal
+        )
+        for sharding_option in cache_tables:
+            if (
+                sharding_option.compute_kernel
+                == EmbeddingComputeKernel.FUSED_UVM_CACHING.value
+            ):
+                assert sharding_option.cache_params  # appease pyre
+                sharding_option.cache_params.load_factor = None
+                sharding_option.compute_kernel = EmbeddingComputeKernel.FUSED.value
+        enumerator.populate_estimates(cache_tables)
+        return sum(sharding_option.total_storage.hbm for sharding_option in proposal)
+
     @staticmethod
     def promote_high_prefetch_overheaad_table_to_hbm(
         enumerator: Optional[Enumerator], proposal: List[ShardingOption]
@@ -621,11 +650,20 @@ def feedback(
             hbm_available = EmbeddingOffloadScaleupProposer.get_budget(
                 plan, storage_constraint
             )
+            # max scale up
+            peak_budget_need = (
+                EmbeddingOffloadScaleupProposer.get_hbm_ceiling(
+                    plan, _none_throws(self.enumerator)
+                )
+                - hbm_used_previously
+            )
+            search_budget = min(hbm_available, peak_budget_need)
             logger.info(
-                f"EmbeddingOffloadScaleupProposer - cache scale up budget={round(bytes_to_gb(hbm_available), 2)} GB, exploring [{round(bytes_to_gb(hbm_used_previously), 2)}, {round(bytes_to_gb(hbm_used_previously + hbm_available), 2)}] GB"
+                f"EmbeddingOffloadScaleupProposer - unscaled plan={round(bytes_to_gb(hbm_used_previously),2)} GB, cache scale up budget={round(bytes_to_gb(hbm_available), 2)} GB, peak scale up budget need={round(bytes_to_gb(peak_budget_need),2)} GB, exploring plans of size [{round(bytes_to_gb(hbm_used_previously), 2)}, {round(bytes_to_gb(hbm_used_previously + search_budget), 2)}] GB"
             )
             self.search = LuusJaakolaSearch(
-                0, hbm_available, max_iterations=16, left_cost=perf_rating
+                0, search_budget, max_iterations=16, left_cost=perf_rating
             )
 
             logger.info(
@@ -663,23 +701,16 @@ def get_budget(proposal: List[ShardingOption], storage_constraint: Topology) ->
         )
         return available_hbm - used_hbm
 
-    # Given an available budget of additional memory, and a provisional sharding plan,
-    # attempt to use the budget wisely to scale up caches that would most benefit from it.
     @staticmethod
-    def next_plan(
-        starting_proposal: List[ShardingOption],
-        budget: Optional[int],
-        enumerator: Optional[Enumerator],
-    ) -> Optional[List[ShardingOption]]:
-        if budget is None or enumerator is None:
-            return None
+    def get_scalable_sharding_options(
+        proposal: List[ShardingOption],
+    ) -> List[ShardingOption]:
+        """Return the subset of tables that we can scale."""
 
         def none_to_zero(x: Optional[float]) -> float:
             return x if x is not None else 0.0
 
-        proposal = copy.deepcopy(starting_proposal)
-        # This is the subset of tables that we can scale
-        cache_tables = [
+        return [
             sharding_option
             for sharding_option in proposal
             if sharding_option.compute_kernel
@@ -693,6 +724,26 @@ def none_to_zero(x: Optional[float]) -> float:
             * none_to_zero(sharding_option.cache_load_factor)
             > 0
         ]
+
+    # Given an available budget of additional memory, and a provisional sharding plan,
+    # attempt to use the budget wisely to scale up caches that would most benefit from it.
+    @staticmethod
+    def next_plan(
+        starting_proposal: List[ShardingOption],
+        budget: Optional[int],
+        enumerator: Optional[Enumerator],
+    ) -> Optional[List[ShardingOption]]:
+        if budget is None or enumerator is None:
+            return None
+
+        def none_to_zero(x: Optional[float]) -> float:
+            return x if x is not None else 0.0
+
+        proposal = copy.deepcopy(starting_proposal)
+        # This is the subset of tables that we can scale
+        cache_tables = EmbeddingOffloadScaleupProposer.get_scalable_sharding_options(
+            proposal
+        )
        # Nothing to scale
         if len(cache_tables) == 0:
             return None
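The _none_throws helper added above lets the type checker narrow Optional[Enumerator] to Enumerator at the call site (_none_throws(self.enumerator)) rather than relying on a bare assert. A self-contained sketch of the pattern; the maybe_budget value is hypothetical:

    from typing import Optional, TypeVar

    _T = TypeVar("_T")

    def _none_throws(x: Optional[_T]) -> _T:
        # Raise on None; otherwise return x with the Optional stripped,
        # so the checker narrows the type at the call site.
        if x is None:
            raise AssertionError("unexpected None")
        return x

    maybe_budget: Optional[int] = 1024  # hypothetical Optional value
    budget: int = _none_throws(maybe_budget)  # int, or AssertionError if None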

torchrec/distributed/planner/tests/test_proposers.py

Lines changed: 30 additions & 2 deletions
@@ -472,6 +472,34 @@ def test_dynamic_programming_three_table(self) -> None:
             num_proposals += 1
         self.assertEqual(2, num_proposals)
 
+    def test_get_scalable_sharding_options(self) -> None:
+        def make_so(
+            name: str, clf: Optional[float], stats: Optional[CacheStatistics]
+        ) -> ShardingOption:
+            so = make_sharding_option(name, 1, clf)
+            if clf:
+                assert so.cache_params
+                so.cache_params.stats = stats
+            return so
+
+        proposal = [
+            make_so("fused", None, None),
+            make_so("caching-no-stats", 0.5, None),
+            make_so(
+                "caching-stats",
+                0.5,
+                MockCacheStatistics(expected_lookups=1, cacheability=0.42),
+            ),
+            make_so(
+                "caching-stats-no-data",
+                0,
+                MockCacheStatistics(expected_lookups=0, cacheability=0),
+            ),
+        ]
+        got = EmbeddingOffloadScaleupProposer.get_scalable_sharding_options(proposal)
+        want = [proposal[-2]]
+        self.assertEqual(got, want)
+
     def test_allocate_budget(self) -> None:
         model = torch.tensor([[1.0, 0.0], [2.0, 3.0], [4.0, 5.0]])
         got = EmbeddingOffloadScaleupProposer.clf_to_bytes(
@@ -823,7 +851,7 @@ def test_budget_shrink(self, _) -> None:
             if initial_mem is None:
                 initial_mem = mem
         # Budget given constraints:
-        # cache scale up budget=92.53 GB, exploring [7.47, 100.0] GB
+        # unscaled plan=7.47 GB, cache scale up budget=92.53 GB, peak scale up budget need=67.06 GB, exploring plans of size [7.47, 74.53] GB
         #
         # Simple perf model, assume partitioner gives a lowest score at 7.9GB, and
         # anything larger than 8GB fails to partition. This is very hard to hit when
@@ -845,7 +873,7 @@ def test_budget_shrink(self, _) -> None:
         self.assertEqual(proposals, 16)
         self.assertNotEqual(initial_mem, best_plan, "couldn't find a better plan")
         # goal is 7.9, we get very close
-        self.assertEqual(best_plan, 7.960684550926089 * GB)
+        self.assertEqual(best_plan, 7.9028974287211895 * GB)
 
     def test_proposers_to_proposals_list(self) -> None:
         def make_mock_proposal(name: str) -> List[ShardingOption]: