
Commit 9bcd60f

Wanchao Liang authored and pytorchmergebot committed
[shard] ShardedTensor Interface (pytorch#74695)
Summary: Pull Request resolved: pytorch#74695

ShardedTensor Interface:
1. Make a ShardedTensorInterface class that is a subclass of torch.Tensor and define the basic APIs on it; ShardedTensor is now a subclass of ShardedTensorInterface.
2. Disable `__torch_dispatch__` by default in ShardedTensorInterface; ShardedTensor will use `__torch_function__` for now. Subclasses of ShardedTensorInterface can opt into `__torch_dispatch__` by overriding it.
3. Remove the attribute functions from ShardedTensor and handle them in `__torch_function__`.

ghstack-source-id: 155141823
(Note: this ignores all push blocking failures!)
Reviewed By: pritamdamania87, fduwjj
Differential Revision: D35123200
fbshipit-source-id: 04ad48ae373e6f61d48bb3bc83021e97b0721362
(cherry picked from commit 71ad555)
1 parent a240d45 commit 9bcd60f
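
As background for the diffs below, here is a rough sketch of the dispatch pattern the summary describes: a torch.Tensor subclass that intercepts ops via __torch_function__ while __torch_dispatch__ stays out of the picture. The class name and the _CUSTOM_OPS registry are illustrative placeholders, not the actual ShardedTensor code; the real implementation populates its registry through the sharded_op_impl decorator shown further down.

import torch

# Hypothetical registry of ops that have sharded implementations. The real
# code fills a similar table through the sharded_op_impl decorator.
_CUSTOM_OPS = {}

class MyShardedTensorInterface(torch.Tensor):
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        if func in _CUSTOM_OPS:
            # Ops with a registered sharded implementation are handled there.
            return _CUSTOM_OPS[func](types, args, kwargs)
        # Everything else falls back to the plain tensor behavior, with
        # torch_function disabled inside the call to avoid infinite recursion.
        with torch._C.DisableTorchFunction():
            return func(*args, **kwargs)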

9 files changed: +202, -138 lines

test/distributed/_shard/sharded_tensor/ops/test_linear.py (+1)

@@ -192,6 +192,7 @@ def test_sharded_linear_rowwise(self):
     def test_sharded_linear_errors(self):
         for spec in generate_chunk_sharding_specs_for_test(0):
             fc1 = torch.nn.Linear(10, 10).cuda(self.rank)
+            shard_parameter(fc1, "weight", spec)
             shard_parameter(fc1, "bias", spec)
             with self.assertRaisesRegex(TypeError, 'bias needs to be torch.Tensor'):
                 fc1(torch.rand(10, 10).cuda(self.rank))

test/distributed/_shard/sharded_tensor/test_sharded_tensor.py (+15, -8)

@@ -421,6 +421,7 @@ def test_sharded_tensor_metadata(self):
         st = sharded_tensor.empty(spec, 10, 20, init_rrefs=True)
         st_metadata = st.metadata()
         self.assertEqual(torch.Size([10, 20]), st_metadata.size)
+        self.assertEqual(torch.Size([10, 20]), st.size())
         self.assertEqual(torch.float, st.dtype)
         self.assertEqual(torch.strided, st.layout)
         self.assertEqual(False, st.requires_grad)
@@ -449,7 +450,7 @@ def test_sharded_tensor_metadata(self):

         # test read only properties, they're read only as we can't simply change
         # the global metadata without changing the underlying shard's properties
-        with self.assertRaisesRegex(AttributeError, "can't set attribute"):
+        with self.assertRaisesRegex(RuntimeError, "torch function '__set__'"):
             st.requires_grad = True

     @with_comms
@@ -908,7 +909,7 @@ def test_invalid_sharding(self):

         spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:1"])
         with self.assertRaisesRegex(ValueError, 'Only torch.strided layout is currently supported'):
-            sharded_tensor.empty(spec, 10, 20, layout=torch.sparse)
+            sharded_tensor.empty(spec, 10, 20, layout=torch.sparse_coo)

         spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:1"])
         with self.assertRaisesRegex(ValueError, 'Only torch.contiguous_format memory_format is currently supported'):
@@ -1025,11 +1026,17 @@ def test_sharded_tensor_sizes(self):
         st = sharded_tensor.empty(spec, (10, 20), init_rrefs=True)
         self.assertEqual(st.size(1), 20)

+        # Test with negative indexed size
+        st = sharded_tensor.empty(spec, (10, 20), init_rrefs=True)
+        self.assertEqual(st.size(-1), 20)
+
+        # Test with dim/ndim
+        self.assertEqual(st.dim(), 2)
+        self.assertEqual(st.ndim, 2)
+
         # Test with invalid input
         st = sharded_tensor.empty(spec, (10, 20), init_rrefs=True)
-        with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[0, 2\\)'):
-            st.size(-1)
-        with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[0, 2\\)'):
+        with self.assertRaisesRegex(IndexError, 'Dimension out of range'):
             st.size(2)

         with self.assertRaises(TypeError):
@@ -1493,15 +1500,15 @@ def test_sharded_tensor_to_cpu(self):
         # CPU sharded tensor should return the same instance (no copy)
         st_cpu = sharded_tensor.zeros(cpu_spec, h, w, process_group=gloo_pg)
         new_st_cpu = st_cpu.cpu()
-        self.assertEqual(st_cpu, new_st_cpu)
+        self.assertTrue(st_cpu is new_st_cpu)

         # GPU sharded tensor to cpu
         st = sharded_tensor.zeros(spec, h, w)
         # test ability to move st to CPU
         spec_before_move = st.sharding_spec()
         new_st = st.cpu(process_group=gloo_pg)
         # return a copy of orginal st
-        self.assertNotEqual(st, new_st)
+        self.assertFalse(st is new_st)
         # check the spec is still ChunkShardingSpec
         spec_after_move = new_st.sharding_spec()
         self.assertIsInstance(spec_after_move, ChunkShardingSpec)
@@ -1534,7 +1541,7 @@ def test_sharded_tensor_to_cpu(self):
         st = sharded_tensor.zeros(mixed_spec, h, w, process_group=gloo_pg)
         new_st = st.cpu()
         # return a copy of orginal st
-        self.assertNotEqual(st, new_st)
+        self.assertFalse(st is new_st)
         # check the spec is still ChunkShardingSpec
         spec_after_move = new_st.sharding_spec()
         self.assertIsInstance(spec_after_move, ChunkShardingSpec)

torch/distributed/_shard/sharded_tensor/_ops/__init__.py (+1)

@@ -1,5 +1,6 @@
 import torch.distributed._shard.sharded_tensor._ops.elementwise_ops
 import torch.distributed._shard.sharded_tensor._ops.math_ops
+import torch.distributed._shard.sharded_tensor._ops.default_tensor_ops

 from .binary_cmp import equal, allclose
 from .embedding import sharded_embedding
torch/distributed/_shard/sharded_tensor/_ops/default_tensor_ops.py (new file, +34)

@@ -0,0 +1,34 @@
+import torch
+from torch.distributed._shard.sharded_tensor import (
+    sharded_op_impl,
+)
+
+
+def register_default_op(op):
+    @sharded_op_impl(op)
+    def tensor_default_op(types, args=(), kwargs=None, pg=None):
+        """
+        Handles ``__torch_function__`` dispatch for the default tensor ops that
+        behave the same as ``torch.Tensor`` such as ``torch.Tensor.shape`` or
+        ``torch.Tensor.dtype``. We simply lower to the real op call with
+        DisableTorchFunction context like ``torch.Tensor.__torch_function__``
+        to avoid recursions.
+        """
+        if kwargs is None:
+            kwargs = {}
+
+        with torch._C.DisableTorchFunction():
+            return op(*args, **kwargs)
+
+# Tensor properties access
+register_default_op(torch.Tensor.requires_grad.__get__)  # type: ignore[attr-defined]
+register_default_op(torch.Tensor.shape.__get__)  # type: ignore[attr-defined]
+register_default_op(torch.Tensor.dtype.__get__)  # type: ignore[attr-defined]
+register_default_op(torch.Tensor.layout.__get__)  # type: ignore[attr-defined]
+register_default_op(torch.Tensor.size)
+register_default_op(torch.Tensor.dim)
+register_default_op(torch.Tensor.ndim.__get__)  # type: ignore[attr-defined]
+register_default_op(torch.Tensor.is_contiguous)
+
+# __reduce_ex__ to dispatch to get_state/set_state
+register_default_op(torch.Tensor.__reduce_ex__)
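
The practical effect of these registrations, judging from the test changes above, is that plain attribute and method access on a ShardedTensor now routes through __torch_function__ and is answered from the tensor's global metadata. A hedged usage sketch follows; it assumes an already-initialized 2-rank process group with one GPU per rank (the tests above set this up through their test harness), and the placements are illustrative.

from torch.distributed._shard.sharding_spec import ChunkShardingSpec
import torch.distributed._shard.sharded_tensor as sharded_tensor

# Assumes torch.distributed is already initialized with 2 ranks.
spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:0", "rank:1/cuda:1"])
st = sharded_tensor.empty(spec, 10, 20)

# These calls now go through __torch_function__ and land in the default
# handlers registered above; they answer from the global metadata, so they
# behave like a regular torch.Tensor of shape (10, 20):
st.size()              # torch.Size([10, 20])
st.size(-1)            # 20, negative dims are accepted now
st.dim(), st.ndim      # 2, 2
st.dtype, st.layout, st.requires_grad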

torch/distributed/_shard/sharded_tensor/_ops/embedding.py (+1, -3)

@@ -1,7 +1,5 @@
 # coding=utf-8

-from typing import cast
-
 import torch
 import torch.distributed as dist
 from ._common import (
@@ -158,7 +156,7 @@ def _validate_embedding_param(args, kwargs):
         raise TypeError("input need to be torch.Tensor")
     if not isinstance(weight, ShardedTensor):
         raise TypeError("weight needs to be ShardedTensor")
-    weight_size = cast(torch.Size, weight.size())
+    weight_size = weight.size()
     if len(weight_size) != 2:
         raise ValueError("Weight needs to have exactly 2 dims")
     if int(torch.min(input).item()) < 0:

torch/distributed/_shard/sharded_tensor/_ops/embedding_bag.py (+1, -1)

@@ -204,7 +204,7 @@ def _validate_embedding_bag_param(args, kwargs):
         raise TypeError("weight needs to be ShardedTensor")
     if len(input.size()) > 2:
         raise ValueError("Input more than 2 dims not supported")
-    weight_size = cast(torch.Size, weight.size())
+    weight_size = weight.size()
     if len(weight_size) != 2:
         raise ValueError("Weight needs to have exactly 2 dims")
     if int(torch.min(input).item()) < 0:

torch/distributed/_shard/sharded_tensor/_ops/linear.py (+9, -9)

@@ -1,4 +1,4 @@
-from typing import List, cast
+from typing import List

 import torch
 import torch.distributed as dist
@@ -105,14 +105,14 @@ def sharded_linear(types, args, kwargs, pg):
     world_size = dist.get_world_size(pg)
     rank = dist.get_rank(pg)

-    if sharding_dim == 1 and isinstance(input, torch.Tensor):
-        return _handle_row_wise_sharding_tensor(
-            input, world_size, weight, rank, local_shard_t, bias, pg
-        )
-    elif sharding_dim == 1 and isinstance(input, ShardedTensor):
+    if sharding_dim == 1 and isinstance(input, ShardedTensor):
         return _handle_row_wise_sharding_sharded_tensor(
             input, world_size, weight, local_shard_t, bias, pg
         )
+    elif sharding_dim == 1 and isinstance(input, torch.Tensor):
+        return _handle_row_wise_sharding_tensor(
+            input, world_size, weight, rank, local_shard_t, bias, pg
+        )
     elif sharding_dim == 0:
         return _handle_col_wise_sharding(
             input, world_size, weight, rank, local_shard_t, bias, pg
@@ -125,7 +125,7 @@ def sharded_linear(types, args, kwargs, pg):

 def _validate_linear_op_param(args, kwargs):
     """
-    Validate input params of sharded embedding op.
+    Validate input params of sharded linear op.

     Args:
         input: input of the linear layer.
@@ -141,13 +141,13 @@ def _validate_linear_op_param(args, kwargs):
     # Validate types
     if not isinstance(input, torch.Tensor) and not isinstance(input, ShardedTensor):
         raise TypeError("input needs to be either torch.Tensor or ShardedTensor")
-    if not isinstance(bias, torch.Tensor):
+    if type(bias) != torch.Tensor and type(bias) != torch.nn.Parameter:
         raise TypeError("bias needs to be torch.Tensor")
     if not isinstance(weight, ShardedTensor):
         raise TypeError("weight needs to be ShardedTensor")
     if len(input.size()) < 1:  # type: ignore[arg-type]
         raise ValueError("Input needs to have at least 1 dim")
-    weight_size = cast(torch.Size, weight.size())
+    weight_size = weight.size()
     if len(weight_size) != 2:
         raise ValueError("Weight needs to have exactly 2 dims")
     if len(bias.size()) != 1:
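
One subtle point in the sharded_linear change above: because ShardedTensor is now a subclass of torch.Tensor (point 1 of the summary), isinstance(input, torch.Tensor) is also true for sharded inputs, so the ShardedTensor branch has to be checked first. A minimal, self-contained illustration of that ordering pitfall; FakeShardedTensor is a stand-in class, not the real type.

import torch

class FakeShardedTensor(torch.Tensor):
    # Stand-in subclass, only used to demonstrate isinstance ordering.
    pass

x = FakeShardedTensor()

# Old ordering: the torch.Tensor branch shadows the subclass branch.
if isinstance(x, torch.Tensor):
    picked = "dense path"
elif isinstance(x, FakeShardedTensor):
    picked = "sharded path"  # unreachable
assert picked == "dense path"

# New ordering (what the diff switches to): check the subclass first.
if isinstance(x, FakeShardedTensor):
    picked = "sharded path"
elif isinstance(x, torch.Tensor):
    picked = "dense path"
assert picked == "sharded path"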
