
Commit 371fc53

Damian Reeves authored and facebook-github-bot committed
Reclaim prefetch promotion budget and reapply during iterative scaleup (#2590)
Summary:
Pull Request resolved: #2590

In D56505315 we promote tables that consume less HBM when not using UVM_CACHING. This can happen when the input is large and the overheads of calculating the uniques and populating the cache (around 7x input size, or (1+6/12)x when using multi-pass prefetch; see shard_estimators.py:calculate_pipeline_io_cost) dominate the saving from having CLF < 1.0. That diff runs the promotion both on the starting proposal (using min-working-set) and again after the proposed scaleup. In the second run, because it happens after scaleup has completed, the saved memory is "wasted".

In this diff, we integrate the promotion logic directly into the iterative scaleup, so any memory saved via promotion is available to further scale hard-to-cache tables. This can improve plan quality. We still keep the original implementation running on the initial starting proposal: without that initial promotion step, the starting proposal may not be partitionable with the configured storage reservation, which would cause the planner to fail and never reach the scaleup.

The net result is that we will try to use all of the probe budget, rather than undershooting, when tables have large prefetch I/O costs.

Reviewed By: keyan

Differential Revision: D66435139

fbshipit-source-id: 27faf36542266d323d5747280f4c1053b610cdc6
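To make the tradeoff concrete, here is a rough back-of-the-envelope sketch (illustrative only; the table size, input size, and CLF below are made up, and this is not the exact formula in shard_estimators.py:calculate_pipeline_io_cost) of why a cached table can cost more HBM than promoting it outright to the FUSED kernel when prefetch I/O is large:

# Hypothetical sizes, for illustration only.
table_size = 10 * 2**30   # 10 GiB embedding table
input_size = 2 * 2**30    # 2 GiB of lookup indices per iteration
clf = 0.5                 # cache load factor under UVM_CACHING

# Prefetch/caching overhead quoted in the summary: ~7x input size
# (or (1 + 6 / 12)x with multi-pass prefetch).
prefetch_overhead = 7 * input_size

uvm_caching_hbm = clf * table_size + prefetch_overhead  # cache + pipeline I/O cost
fused_hbm = table_size                                  # whole table resident in HBM

# 0.5 * 10 GiB + 14 GiB = 19 GiB cached vs. 10 GiB fused: promotion both frees
# HBM and removes the cache-maintenance work.
print(uvm_caching_hbm / 2**30, fused_hbm / 2**30)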
1 parent 49288c3 commit 371fc53

File tree

2 files changed (+89, -24 lines)

torchrec/distributed/planner/proposers.py

Lines changed: 45 additions & 19 deletions
@@ -12,7 +12,7 @@
 import logging
 from collections import OrderedDict
 from decimal import Decimal
-from typing import cast, Dict, List, Optional, Set, Tuple, TypeVar, Union
+from typing import Callable, cast, Dict, List, Optional, Set, Tuple, TypeVar, Union
 
 import torch
 
@@ -687,10 +687,6 @@ def feedback(
         self.proposal = EmbeddingOffloadScaleupProposer.next_plan(
             self.starting_proposal, budget, self.enumerator
         )
-        if self.proposal is not None:
-            self.promote_high_prefetch_overheaad_table_to_hbm(
-                self.enumerator, self.proposal
-            )
 
     @staticmethod
     def get_budget(proposal: List[ShardingOption], storage_constraint: Topology) -> int:
@@ -748,8 +744,10 @@ def none_to_zero(x: Optional[float]) -> float:
         if len(cache_tables) == 0:
             return None
 
-        size_model = EmbeddingOffloadScaleupProposer.build_affine_storage_model(
-            cache_tables, enumerator
+        size_model, fused_hbm_ceiling = (
+            EmbeddingOffloadScaleupProposer.build_affine_storage_model(
+                cache_tables, enumerator
+            )
         )
         clfs = torch.tensor(
             [sharding_option.cache_load_factor for sharding_option in cache_tables]
@@ -772,6 +770,7 @@ def none_to_zero(x: Optional[float]) -> float:
         )
         new_clfs = EmbeddingOffloadScaleupProposer.allocate_budget(
             model=size_model,
+            fused_hbm_ceiling=fused_hbm_ceiling,
             clfs=clfs,
             budget=budget,
             allocation_priority=cooked_cacheability,
@@ -788,9 +787,10 @@ def none_to_zero(x: Optional[float]) -> float:
                 sharding_option.cache_params.load_factor = None
                 sharding_option.compute_kernel = EmbeddingComputeKernel.FUSED.value
                 num_promoted += 1
-        logger.info(
-            f"EmbeddingOffloadScaleupProposer - Promoted {num_promoted} tables to HBM because cache size is similar to table size."
-        )
+        if num_promoted > 0:
+            logger.info(
+                f"EmbeddingOffloadScaleupProposer - Promoted {num_promoted} tables to HBM because cache size is similar to table size."
+            )
         # recalculate cost estimates of modified tables
         enumerator.populate_estimates(cache_tables)
         return proposal
@@ -822,32 +822,42 @@ def get_expected_lookups(sharding_option: ShardingOption) -> Optional[float]:
     @staticmethod
     def build_affine_storage_model(
         uvm_caching_sharding_options: List[ShardingOption], enumerator: Enumerator
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         plan: List[ShardingOption] = copy.deepcopy(uvm_caching_sharding_options)
 
-        def compute_hbm_sizes(clf: float) -> torch.Tensor:
+        def set_clf(sharding_option: ShardingOption, clf: float) -> None:
+            assert sharding_option.cache_params  # appease pyre
+            sharding_option.cache_params.load_factor = clf
+
+        def set_fused(sharding_option: ShardingOption) -> None:
+            assert sharding_option.cache_params  # appease pyre
+            sharding_option.cache_params.load_factor = None
+            sharding_option.compute_kernel = EmbeddingComputeKernel.FUSED.value
+
+        def compute_hbm_sizes(f: Callable[[ShardingOption], None]) -> torch.Tensor:
             for sharding_option in plan:
-                assert sharding_option.cache_params  # appease pyre
-                sharding_option.cache_params.load_factor = clf
+                f(sharding_option)
             enumerator.populate_estimates(plan)
             return torch.tensor(
                 [sharding_option.total_storage.hbm for sharding_option in plan]
             )
 
         low_clf, high_clf = 0.1, 0.9
-        low_hbms = compute_hbm_sizes(low_clf)
-        high_hbms = compute_hbm_sizes(high_clf)
+        low_hbms = compute_hbm_sizes(lambda so: set_clf(so, low_clf))
+        high_hbms = compute_hbm_sizes(lambda so: set_clf(so, high_clf))
+        fused_hbms = compute_hbm_sizes(set_fused)
 
         A = (high_hbms - low_hbms) / (high_clf - low_clf)
         B = low_hbms - A * low_clf
-        return torch.stack((A, B), dim=1)  # Nx2 (a,b)
+        caching_model = torch.stack((A, B), dim=1)  # Nx2 (a,b)
+        return caching_model, fused_hbms
 
     @staticmethod
     def clf_to_bytes(
         model: torch.Tensor, clfs: Union[float, torch.Tensor]
     ) -> torch.Tensor:
         # evaluate affine model AX + B
-        return (model[:, 0] * clfs + model[:, 1]).to(torch.int64)
+        return (model[:, 0] * clfs + model[:, 1]).to(torch.float64)
 
     # Given a model of an affine system, an existing configuration (clfs), available
     # budget, and an allocation policy, return new configuration that best uses the
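As a standalone illustration of the affine model above (the probe values below are hypothetical, not from a real enumerator run): probing a table's HBM estimate at CLF 0.1 and 0.9 and fitting a line recovers the per-table (slope, intercept) row that clf_to_bytes later evaluates.

import torch

low_clf, high_clf = 0.1, 0.9
low_hbm = torch.tensor([5_000_000.0])    # hypothetical estimate at clf=0.1
high_hbm = torch.tensor([29_000_000.0])  # hypothetical estimate at clf=0.9

A = (high_hbm - low_hbm) / (high_clf - low_clf)  # slope: bytes per unit of CLF
B = low_hbm - A * low_clf                        # intercept: fixed overheads
model = torch.stack((A, B), dim=1)               # Nx2 rows of (a, b)

# clf_to_bytes evaluates hbm(clf) = a * clf + b; here a=30e6 and b=2e6,
# so clf=0.5 costs 17e6 bytes.
hbm_at_half = model[:, 0] * 0.5 + model[:, 1]
print(model, hbm_at_half)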
@@ -856,6 +866,7 @@ def clf_to_bytes(
     @staticmethod
     def allocate_budget(
         model: torch.Tensor,
+        fused_hbm_ceiling: torch.Tensor,
         clfs: torch.Tensor,
         budget: int,
         allocation_priority: torch.Tensor,
@@ -882,7 +893,7 @@ def allocate_budget(
             if mask.sum() == 0:
                 break
 
-            logging.debug(
+            logger.debug(
                 f"[allocate_budget] pass={num_pass}, budget={budget}, #cache_tables={mask.sum()}"
             )
 
@@ -902,6 +913,21 @@ def allocate_budget(
             # to HBM vs spending that budget on improving hit rate on other tables in
             # next pass.
 
+            # Is any table over the size we'd get if we promoted to HBM? (promotion can
+            # be smaller if input size is large when using prefetch). If so, mark for
+            # promotion and reclaim budget to use on remaining tables.
+            promotes = mask & (min_size_bytes + cache_size_bytes > fused_hbm_ceiling)
+            if promotes.sum() > 0:
+                budget_reclaimed = torch.sum(
+                    ((min_size_bytes + cache_size_bytes) - fused_hbm_ceiling)[promotes]
+                ).item()
+                logger.debug(
+                    f"[allocate_budget] {promotes.sum()} tables exceeded ceiling, promoting to save {budget_reclaimed} bytes"
+                )
+                budget += budget_reclaimed
+                # force these tables to 1.0 to ensure promotion
+                cache_size_bytes[promotes] = max_cache_size_bytes[promotes]
+
             # cache_size_bytes are the new cache sizes we want to use. We convert them back
             # to clfs by dividing by max_cache_size_bytes, which has isolated the clf
             # portion of the table size from the fixed overheads.
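The reclaim step can also be walked through in isolation (a toy sketch with made-up tensors, not planner code): any table whose proposed cached footprint exceeds its fused ceiling is forced to CLF 1.0 and the per-table saving is handed back to the budget for the remaining tables.

import torch

# Two tables; table 0's cached footprint (min + cache) exceeds its fused ceiling.
min_size_bytes = torch.tensor([4_000_000, 4_000_000])
cache_size_bytes = torch.tensor([28_000_000, 10_000_000])
max_cache_size_bytes = torch.tensor([30_000_000, 30_000_000])
fused_hbm_ceiling = torch.tensor([25_000_000, 34_000_000])
mask = torch.tensor([True, True])
budget = 1_000_000

promotes = mask & (min_size_bytes + cache_size_bytes > fused_hbm_ceiling)
if promotes.sum() > 0:
    budget_reclaimed = torch.sum(
        ((min_size_bytes + cache_size_bytes) - fused_hbm_ceiling)[promotes]
    ).item()
    budget += budget_reclaimed                                   # 7_000_000 bytes returned
    cache_size_bytes[promotes] = max_cache_size_bytes[promotes]  # force CLF to 1.0

print(promotes, budget, cache_size_bytes)  # table 0 promoted; budget grows to 8_000_000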

torchrec/distributed/planner/tests/test_proposers.py

Lines changed: 44 additions & 5 deletions
@@ -505,17 +505,19 @@ def test_allocate_budget(self) -> None:
         got = EmbeddingOffloadScaleupProposer.clf_to_bytes(
             model, torch.tensor([0, 0.5, 1])
         )
-        torch.testing.assert_close(got, torch.tensor([0, 4, 9]))
+        torch.testing.assert_close(got, torch.tensor([0, 4, 9], dtype=torch.float64))
 
         # Scenario 1, enough budget to scale everything to 1.0
         model = torch.tensor(
             [[30_000_000, 2_000_000], [30_000_000, 2_000_000], [30_000_000, 2_000_000]]
         )
+        fused_hbm_ceiling = EmbeddingOffloadScaleupProposer.clf_to_bytes(model, 1.0)
         mins = torch.tensor([0.1, 0.1, 1])
         budget = 100_000_000
         got = EmbeddingOffloadScaleupProposer.allocate_budget(
             model,
-            clfs=torch.tensor(mins),
+            fused_hbm_ceiling=fused_hbm_ceiling,
+            clfs=mins,
             budget=budget,
             allocation_priority=torch.tensor([2, 2, 2]),
         )
@@ -530,10 +532,15 @@ def test_allocate_budget(self) -> None:
         model = torch.tensor(
             [[30_000_000, 2_000_000], [30_000_000, 2_000_000], [30_000_000, 2_000_000]]
         )
+        fused_hbm_ceiling = EmbeddingOffloadScaleupProposer.clf_to_bytes(model, 1.0)
         mins = torch.tensor([0.1, 0.1, 1])
         budget = 10_000_000
         got = EmbeddingOffloadScaleupProposer.allocate_budget(
-            model, clfs=mins, budget=budget, allocation_priority=torch.tensor([2, 2, 2])
+            model,
+            fused_hbm_ceiling=fused_hbm_ceiling,
+            clfs=mins,
+            budget=budget,
+            allocation_priority=torch.tensor([2, 2, 2]),
         )
         torch.testing.assert_close(got, torch.tensor([0.26667, 0.26667, 1.0]))
         increase = (
@@ -546,10 +553,15 @@ def test_allocate_budget(self) -> None:
         model = torch.tensor(
             [[30_000_000, 2_000_000], [30_000_000, 2_000_000], [30_000_000, 2_000_000]]
         )
+        fused_hbm_ceiling = EmbeddingOffloadScaleupProposer.clf_to_bytes(model, 1.0)
         mins = torch.tensor([0.1, 0.1, 1])
         budget = 10_000_000
         got = EmbeddingOffloadScaleupProposer.allocate_budget(
-            model, clfs=mins, budget=budget, allocation_priority=torch.tensor([2, 4, 2])
+            model,
+            fused_hbm_ceiling=fused_hbm_ceiling,
+            clfs=mins,
+            budget=budget,
+            allocation_priority=torch.tensor([2, 4, 2]),
         )
         # increase is twice as much for table 2 (started at 0.1)
         torch.testing.assert_close(
@@ -559,16 +571,18 @@ def test_allocate_budget(self) -> None:
             EmbeddingOffloadScaleupProposer.clf_to_bytes(model, got).sum()
             - EmbeddingOffloadScaleupProposer.clf_to_bytes(model, mins).sum()
         )
-        self.assertEqual(increase, budget)
+        self.assertEqual(int(increase), budget)
 
         # Scenario 4, multi-pass scale up
         model = torch.tensor(
             [[30_000_000, 2_000_000], [30_000_000, 2_000_000], [30_000_000, 2_000_000]]
         )
+        fused_hbm_ceiling = EmbeddingOffloadScaleupProposer.clf_to_bytes(model, 1.0)
         mins = torch.tensor([0.1, 0.3, 0.5])
         budget = 50_000_000
         got = EmbeddingOffloadScaleupProposer.allocate_budget(
             model,
+            fused_hbm_ceiling=fused_hbm_ceiling,
             clfs=mins,
             budget=budget,
             allocation_priority=torch.tensor([1, 2, 100]),
@@ -580,6 +594,31 @@ def test_allocate_budget(self) -> None:
         )
         self.assertEqual(increase, budget)
 
+        # Scenario 5, prefetch overhead causing early promotion
+        # like scenario 4, but we set fused size to 80%, which saves enough memory
+        # to promote all 3 to HBM inside the same budget.
+        model = torch.tensor(
+            [[30_000_000, 2_000_000], [30_000_000, 2_000_000], [30_000_000, 2_000_000]]
+        )
+        fused_hbm_ceiling = (
+            EmbeddingOffloadScaleupProposer.clf_to_bytes(model, 1.0) * 0.8
+        )
+        mins = torch.tensor([0.1, 0.3, 0.5])
+        budget = 50_000_000
+        got = EmbeddingOffloadScaleupProposer.allocate_budget(
+            model,
+            fused_hbm_ceiling=fused_hbm_ceiling,
+            clfs=mins,
+            budget=budget,
+            allocation_priority=torch.tensor([1, 2, 100]),
+        )
+        torch.testing.assert_close(got, torch.tensor([1.0, 1.0, 1.0]))
+        self.assertLessEqual(
+            fused_hbm_ceiling.sum().item(),
+            EmbeddingOffloadScaleupProposer.clf_to_bytes(model, mins).sum().item()
+            + budget,
+        )
+
     @unittest.mock.patch(
         "torchrec.distributed.planner.shard_estimators._calculate_storage_specific_sizes",
         side_effect=mock_calculate_storage_specific_sizes,
