
Commit 00d8ed2

iamzainhuda authored and facebook-github-bot committed
add size and stride for empty shard DT (#2662)
Summary:
Pull Request resolved: #2662

Bring the DTensor (DT) empty shard on a rank in line with the ShardedTensor (ST) empty-shard behavior. For OT, the current DT approach broke transfer learning because callers expect tensor.size() to return the global shape, so we amend the DT empty-shard init to include the global shape and stride.

Differential Revision: D67727355

fbshipit-source-id: 9823d3e75c7e4bf2dad1b77d8dcbd0ee960205ec
1 parent fc79c7a commit 00d8ed2

File tree

2 files changed: +54, -1 lines changed

torchrec/distributed/embeddingbag.py (+12)

@@ -32,6 +32,7 @@
 from torch.distributed._tensor import DTensor
 from torch.nn.modules.module import _IncompatibleKeys
 from torch.nn.parallel import DistributedDataParallel
+from torchrec.distributed.comm import get_local_size
 from torchrec.distributed.embedding_sharding import (
     EmbeddingSharding,
     EmbeddingShardingContext,
@@ -73,6 +74,7 @@
     add_params_from_parameter_sharding,
     append_prefix,
     convert_to_fbgemm_types,
+    create_global_tensor_shape_stride_from_metadata,
     maybe_annotate_embedding_event,
     merge_fused_params,
     none_throws,
@@ -918,6 +920,14 @@ def _initialize_torch_state(self) -> None:  # noqa
                     )
                 )
             else:
+                shape, stride = create_global_tensor_shape_stride_from_metadata(
+                    none_throws(self.module_sharding_plan[table_name]),
+                    (
+                        self._env.node_group_size
+                        if isinstance(self._env, ShardingEnv2D)
+                        else get_local_size(self._env.world_size)
+                    ),
+                )
                 # empty shard case
                 self._model_parallel_name_to_dtensor[table_name] = (
                     DTensor.from_local(
@@ -927,6 +937,8 @@ def _initialize_torch_state(self) -> None:  # noqa
                         ),
                         device_mesh=self._env.device_mesh,
                         run_check=False,
+                        shape=shape,
+                        stride=stride,
                     )
                 )
             else:
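The empty-shard branch above passes an explicit global shape and stride into DTensor.from_local, so tensor.size() on a rank holding no shard reports the global table shape. The following is a minimal standalone sketch of that pattern, not TorchRec code: it assumes a single-process gloo group, a plain empty tensor in place of TorchRec's local shard wrapper, and a hypothetical 100 x 64 global table.

import os

import torch
import torch.distributed as dist
from torch.distributed._tensor import DeviceMesh, DTensor

# Single-process process group, for illustration only.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

mesh = DeviceMesh("cpu", [0])  # one-rank mesh standing in for self._env.device_mesh
local_shard = torch.empty(0)   # this rank holds no rows of the (hypothetical) table

dt = DTensor.from_local(
    local_shard,
    device_mesh=mesh,
    run_check=False,
    shape=torch.Size([100, 64]),  # hypothetical global table shape
    stride=(64, 1),               # contiguous row-major stride for that shape
)
print(dt.size())  # expected: torch.Size([100, 64]), the global shape

dist.destroy_process_group()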

torchrec/distributed/utils.py (+42, -1)

@@ -15,7 +15,7 @@
 from collections import OrderedDict
 from contextlib import AbstractContextManager, nullcontext
 from dataclasses import asdict
-from typing import Any, Dict, List, Optional, Set, Type, TypeVar, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, TypeVar, Union
 
 import torch
 from fbgemm_gpu.split_embedding_configs import EmbOptimType
@@ -511,3 +511,44 @@ def interaction(self, *args, **kwargs) -> None:
         pdb.Pdb.interaction(self, *args, **kwargs)
     finally:
         sys.stdin = _stdin
+
+
+def create_global_tensor_shape_stride_from_metadata(
+    parameter_sharding: ParameterSharding, devices_per_node: Optional[int] = None
+) -> Tuple[torch.Size, Tuple[int, int]]:
+    """
+    Create a global tensor shape and stride from shard metadata.
+
+    Returns:
+        torch.Size: global tensor shape.
+        tuple: global tensor stride.
+    """
+    size = None
+    if parameter_sharding.sharding_type == ShardingType.COLUMN_WISE.value:
+        row_dim = parameter_sharding.sharding_spec.shards[0].shard_sizes[0]  # pyre-ignore[16]
+        col_dim = 0
+        for shard in parameter_sharding.sharding_spec.shards:
+            col_dim += shard.shard_sizes[1]
+        size = torch.Size([row_dim, col_dim])
+    elif (
+        parameter_sharding.sharding_type == ShardingType.ROW_WISE.value
+        or parameter_sharding.sharding_type == ShardingType.TABLE_ROW_WISE.value
+    ):
+        row_dim = 0
+        col_dim = parameter_sharding.sharding_spec.shards[0].shard_sizes[1]
+        for shard in parameter_sharding.sharding_spec.shards:
+            row_dim += shard.shard_sizes[0]
+        size = torch.Size([row_dim, col_dim])
+    elif parameter_sharding.sharding_type == ShardingType.TABLE_WISE.value:
+        size = torch.Size(parameter_sharding.sharding_spec.shards[0].shard_sizes)
+    elif parameter_sharding.sharding_type == ShardingType.GRID_SHARD.value:
+        # we need node group size to appropriately calculate global shape from shard
+        assert devices_per_node is not None
+        row_dim, col_dim = 0, 0
+        num_cw_shards = len(parameter_sharding.sharding_spec.shards) // devices_per_node
+        for _ in range(num_cw_shards):
+            col_dim += parameter_sharding.sharding_spec.shards[0].shard_sizes[1]
+        for _ in range(devices_per_node):
+            row_dim += parameter_sharding.sharding_spec.shards[0].shard_sizes[0]
+        size = torch.Size([row_dim, col_dim])
+    return size, (size[1], 1) if size else (torch.Size([0, 0]), (0, 1))  # pyre-ignore[7]
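For illustration, a hedged usage sketch of the helper added above. The shard metadata is hypothetical: a 100 x 128 table split column-wise into two 100 x 64 shards on ranks 0 and 1, which should produce a global shape of torch.Size([100, 128]) and a row-major stride of (128, 1).

import torch
from torch.distributed._shard.sharding_spec import EnumerableShardingSpec, ShardMetadata
from torchrec.distributed.types import ParameterSharding, ShardingType
from torchrec.distributed.utils import create_global_tensor_shape_stride_from_metadata

# Hypothetical 100 x 128 table split column-wise into two 100 x 64 shards.
parameter_sharding = ParameterSharding(
    sharding_type=ShardingType.COLUMN_WISE.value,
    compute_kernel="fused",  # placeholder; not used by the helper
    ranks=[0, 1],
    sharding_spec=EnumerableShardingSpec(
        shards=[
            ShardMetadata(shard_offsets=[0, 0], shard_sizes=[100, 64], placement="rank:0/cuda:0"),
            ShardMetadata(shard_offsets=[0, 64], shard_sizes=[100, 64], placement="rank:1/cuda:1"),
        ]
    ),
)

shape, stride = create_global_tensor_shape_stride_from_metadata(parameter_sharding)
print(shape, stride)  # expected: torch.Size([100, 128]) (128, 1)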
