
Commit d945603

emlin authored and facebook-github-bot committed
fix EBC optimizer size setting for virtual table (#3239)
Summary:
Pull Request resolved: #3239

Add the missing optimizer state processing logic, similar to what EC does here: https://fburl.com/code/hvzzs3t4, so that the optimizer state does not use the default metadata, which reflects the virtual table size rather than the actual tensor size.

Reviewed By: EddyLXJ

Differential Revision: D78950717

fbshipit-source-id: 45eca79bbff1fe498e2707b51a6845eb603bbdfd
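To make the size mismatch concrete, here is a small illustrative sketch that is not part of the commit; all names and numbers are hypothetical. A virtual table advertises a very large row space to the planner, so metadata derived from the sharding spec records that virtual size, while the tensor that is actually checkpointed, and its optimizer state, only spans the rows materialized in the kv-store backend.

```python
# Illustrative only: contrasts the "virtual" size the planner sees with the size
# of the tensor that is actually materialized and checkpointed. All values are
# hypothetical.
VIRTUAL_NUM_EMBEDDINGS = 2**50          # planner-visible virtual row space
EMBEDDING_DIM = 128
MATERIALIZED_ROWS_ON_THIS_RANK = 4_096  # rows the kv-store backend actually holds

# Metadata built from the sharding spec would claim the virtual size ...
spec_based_size = (VIRTUAL_NUM_EMBEDDINGS, EMBEDDING_DIM)

# ... while the local tensor (and its optimizer state) is only this large.
actual_local_size = (MATERIALIZED_ROWS_ON_THIS_RANK, EMBEDDING_DIM)

print(f"spec-based metadata size: {spec_based_size}")
print(f"actual local tensor size: {actual_local_size}")
```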
1 parent 69acf48 commit d945603

File tree

1 file changed (+56, -23 lines)

torchrec/distributed/embeddingbag.py

Lines changed: 56 additions & 23 deletions
@@ -1081,33 +1081,66 @@ def _initialize_torch_state(self, skip_registering: bool = False) -> None: # no
             # note: at this point kvstore backed tensors don't own valid snapshots, so no read
             # access is allowed on them.
 
-            # create ShardedTensor from local shards and metadata avoding all_gather collective
-            sharding_spec = none_throws(
-                self.module_sharding_plan[table_name].sharding_spec
-            )
-
-            tensor_properties = TensorProperties(
-                dtype=(
-                    data_type_to_dtype(
-                        self._table_name_to_config[table_name].data_type
+            if self._table_name_to_config[table_name].use_virtual_table:
+                # virtual table size will be recalculated before checkpointing. Here we cannot
+                # use sharding spec to build tensor metadata which will exceed the checkpoint capacity limit
+                self._model_parallel_name_to_sharded_tensor[table_name] = (
+                    ShardedTensor._init_from_local_shards(
+                        local_shards,
+                        (
+                            [
+                                # assuming virtual table only supports rw sharding for now
+                                # When backend return whole row, need to respect dim(1)
+                                # otherwise will see shard dim exceeded tensor dim error
+                                (
+                                    0
+                                    if dim == 0
+                                    else (
+                                        local_shards[0].metadata.shard_sizes[1]
+                                        if dim == 1
+                                        else dim_size
+                                    )
+                                )
+                                for dim, dim_size in enumerate(
+                                    self._name_to_table_size[table_name]
+                                )
+                            ]
+                        ),
+                        process_group=(
+                            self._env.sharding_pg
+                            if isinstance(self._env, ShardingEnv2D)
+                            else self._env.process_group
+                        ),
                     )
-                ),
-            )
+                )
+            else:
+                # create ShardedTensor from local shards and metadata avoding all_gather collective
+                sharding_spec = none_throws(
+                    self.module_sharding_plan[table_name].sharding_spec
+                )
 
-            self._model_parallel_name_to_sharded_tensor[table_name] = (
-                ShardedTensor._init_from_local_shards_and_global_metadata(
-                    local_shards=local_shards,
-                    sharded_tensor_metadata=sharding_spec.build_metadata(
-                        tensor_sizes=self._name_to_table_size[table_name],
-                        tensor_properties=tensor_properties,
-                    ),
-                    process_group=(
-                        self._env.sharding_pg
-                        if isinstance(self._env, ShardingEnv2D)
-                        else self._env.process_group
+                tensor_properties = TensorProperties(
+                    dtype=(
+                        data_type_to_dtype(
+                            self._table_name_to_config[table_name].data_type
+                        )
                     ),
                 )
-            )
+
+                self._model_parallel_name_to_sharded_tensor[table_name] = (
+                    ShardedTensor._init_from_local_shards_and_global_metadata(
+                        local_shards=local_shards,
+                        sharded_tensor_metadata=sharding_spec.build_metadata(
+                            tensor_sizes=self._name_to_table_size[table_name],
+                            tensor_properties=tensor_properties,
+                        ),
+                        process_group=(
+                            self._env.sharding_pg
+                            if isinstance(self._env, ShardingEnv2D)
+                            else self._env.process_group
+                        ),
+                    )
+                )
 
         def extract_sharded_kvtensors(
             module: ShardedEmbeddingBagCollection,
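For reference, the standalone sketch below mirrors the size calculation the new virtual-table branch performs before calling ShardedTensor._init_from_local_shards; the helper name and example sizes are mine, not torchrec's. Dim 0 is zeroed because the virtual row count is recalculated before checkpointing, and dim 1 follows the local shard width so that backends returning whole rows do not trigger a shard-dim-exceeds-tensor-dim error.

```python
# Standalone sketch (hypothetical helper, not torchrec API): derive the global
# size passed to ShardedTensor._init_from_local_shards for a virtual table.
from typing import List, Sequence


def virtual_table_global_size(
    table_size: Sequence[int],         # planner's virtual table size, e.g. (2**50, 128)
    local_shard_sizes: Sequence[int],  # shard_sizes of the first local shard, e.g. (1024, 128)
) -> List[int]:
    # Assumes row-wise sharding, as the added code's comment does: dim 0 is
    # zeroed out (recomputed before checkpointing), dim 1 respects the local
    # shard width, and any further dims keep the planner's size.
    return [
        0 if dim == 0 else (local_shard_sizes[1] if dim == 1 else dim_size)
        for dim, dim_size in enumerate(table_size)
    ]


if __name__ == "__main__":
    print(virtual_table_global_size((2**50, 128), (1024, 128)))  # -> [0, 128]
```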
