
Commit 4b5a8a3

aporialia authored and facebook-github-bot committed
Remove QuantizedEBC grouping by data type (#2571)
Summary:
Pull Request resolved: #2571

FBGEMM supports initializing a TBE with a list of data types, so we no longer have to split tables whose quantization data types differ into separate TBEs. This was already implemented for the ShardedQuantizedEBC (see the base diff), which helped bring TorchRec eager-mode inference to on-par QPS with non-eager-mode inference. This diff applies the same grouping to the QuantizedEBC for consistency, even though it is not strictly needed there.

Reviewed By: PaulZhang12

Differential Revision: D63861064

fbshipit-source-id: 376adbce6cde5d9c45def0836c78c1455993d419
1 parent 50889bd commit 4b5a8a3
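
The core of the change is the grouping key: tables are now bucketed by pooling type alone, and each table's data type travels with its spec into the TBE. Below is a minimal sketch of that rule using made-up table configs; import paths may vary across TorchRec versions.

from collections import defaultdict

from torchrec.modules.embedding_configs import (
    DataType,
    EmbeddingBagConfig,
    PoolingType,
    data_type_to_sparse_type,
)

# Two hypothetical tables that share a pooling type but use
# different quantization data types.
tables = [
    EmbeddingBagConfig(
        num_embeddings=100, embedding_dim=16, name="t0", feature_names=["f0"],
        pooling=PoolingType.SUM, data_type=DataType.INT8,
    ),
    EmbeddingBagConfig(
        num_embeddings=100, embedding_dim=16, name="t1", feature_names=["f1"],
        pooling=PoolingType.SUM, data_type=DataType.INT4,
    ),
]

# Old key: (table.pooling, table.data_type) -> two groups, hence two TBEs.
# New key: table.pooling alone -> one group; the data type stays per-table.
key_to_tables = defaultdict(list)
for table in tables:
    key_to_tables[table.pooling].append(table)

for pooling, group in key_to_tables.items():
    specs = [
        (t.name, t.num_embeddings, t.embedding_dim,
         data_type_to_sparse_type(t.data_type))
        for t in group
    ]
    print(pooling, specs)  # one group despite the INT8/INT4 mix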

File tree

2 files changed: +38 −5 lines


torchrec/inference/tests/test_inference.py

Lines changed: 33 additions & 0 deletions
@@ -13,6 +13,7 @@
 
 import torch
 from fbgemm_gpu.split_embedding_configs import SparseType
+from torchrec import PoolingType
 from torchrec.datasets.criteo import DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES
 from torchrec.distributed.global_settings import set_propogate_device
 from torchrec.distributed.test_utils.test_model import (
@@ -298,3 +299,35 @@ def test_sharded_quantized_tbe_count(self) -> None:
                 spec[1],
                 expected_num_embeddings[spec[0]],
             )
+
+    def test_quantized_tbe_count_different_pooling(self) -> None:
+        set_propogate_device(True)
+
+        self.tables[0].pooling = PoolingType.MEAN
+        model = TestSparseNN(
+            tables=self.tables,
+            weighted_tables=self.weighted_tables,
+            num_float_features=10,
+            dense_device=torch.device("cpu"),
+            sparse_device=torch.device("cpu"),
+            over_arch_clazz=TestOverArchRegroupModule,
+        )
+
+        model.eval()
+        _, local_batch = ModelInput.generate(
+            batch_size=16,
+            world_size=1,
+            num_float_features=10,
+            tables=self.tables,
+            weighted_tables=self.weighted_tables,
+        )
+
+        model(local_batch[0])
+
+        # Quantize the model and collect quantized weights
+        quantized_model = quantize_inference_model(model)
+        # We should have 2 TBEs for unweighted ebc as the 2 tables here have different pooling types
+        self.assertTrue(len(quantized_model.sparse.ebc.tbes) == 2)
+        self.assertTrue(len(quantized_model.sparse.weighted_ebc.tbes) == 1)
+        # Changing this back
+        self.tables[0].pooling = PoolingType.SUM
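
To run just the new test in isolation (assuming a standard pytest setup for the repo):

python -m pytest torchrec/inference/tests/test_inference.py -k test_quantized_tbe_count_different_pooling

Note what the assertions check: the two unweighted tables still produce two TBEs because their pooling types differ. Pooling mode remains a per-module argument in FBGEMM, whereas the data type is now carried per-table.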

torchrec/quant/embedding_modules.py

Lines changed: 5 additions & 5 deletions
@@ -382,15 +382,14 @@ def __init__(
             if table.name in table_names:
                 raise ValueError(f"Duplicate table name {table.name}")
             table_names.add(table.name)
-            key = (table.pooling, table.data_type)
-            self._key_to_tables[key].append(table)
+            # pyre-ignore
+            self._key_to_tables[table.pooling].append(table)
 
         location = (
             EmbeddingLocation.HOST if device.type == "cpu" else EmbeddingLocation.DEVICE
         )
 
-        for key, emb_configs in self._key_to_tables.items():
-            (pooling, data_type) = key
+        for pooling, emb_configs in self._key_to_tables.items():
             embedding_specs = []
             weight_lists: Optional[
                 List[Tuple[torch.Tensor, Optional[torch.Tensor]]]
@@ -409,7 +408,7 @@ def __init__(
                             else table.num_embeddings
                         ),
                         table.embedding_dim,
-                        data_type_to_sparse_type(data_type),
+                        data_type_to_sparse_type(table.data_type),
                         location,
                     )
                 )
@@ -421,6 +420,7 @@ def __init__(
 
             emb_module = IntNBitTableBatchedEmbeddingBagsCodegen(
                 embedding_specs=embedding_specs,
+                # pyre-ignore
                 pooling_mode=pooling_type_to_pooling_mode(pooling),
                 weight_lists=weight_lists,
                 device=device,
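
With grouping keyed on pooling alone, the loop above hands FBGEMM a single embedding_specs list in which the SparseType is resolved per table. A hedged sketch of the resulting mixed-precision TBE (the table names are invented, and the fbgemm_gpu import paths differ between releases):

import torch
from fbgemm_gpu.split_embedding_configs import SparseType
from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
    EmbeddingLocation,
    PoolingMode,
)
from fbgemm_gpu.split_table_batched_embeddings_ops_inference import (
    IntNBitTableBatchedEmbeddingBagsCodegen,
)

# One TBE holding an INT8 table and an INT4 table, both SUM-pooled on CPU.
emb_module = IntNBitTableBatchedEmbeddingBagsCodegen(
    embedding_specs=[
        ("t0", 100, 16, SparseType.INT8, EmbeddingLocation.HOST),
        ("t1", 100, 16, SparseType.INT4, EmbeddingLocation.HOST),
    ],
    pooling_mode=PoolingMode.SUM,  # pooling is still one value per TBE
    device=torch.device("cpu"),
)
emb_module.fill_random_weights()  # allocate quantized weight buffers for the sketch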
