
Commit be16338

Place args for all gather/reduce on devices before the op to avoid CSE and excessive copying (#171)
We encode the device/shard information in the flow.tensor.transfer/transfer_to_logical_device operation. If we then do an all-gather or an all-reduce, CSE is happy to collapse the identical expressions into one, so the all-gather/all-reduce is performed on one device and the result is copied to the rest. We want each device to do its own all-gather/all-reduce, so the arguments are now placed on each device before the op. There is no easy way to test the desired effect directly, but we at least test for correctness at the PyTorch level. This change also adds the all_reduce op, which is currently not used anywhere, and expands the elementwise op to support a variable number of tensor arguments.
1 parent 2fecbfd commit be16338
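For reference, a minimal PyTorch-level sketch (not sharktank code; plain tensors stand in for shards and a Python list stands in for the devices) of the result every device should hold after an all-gather and an all-reduce. The per-device transfers added in the diffs below exist so the compiled program actually performs this computation on every device instead of computing it once and broadcasting the result.

import torch

# One shard per "device"; a Python list models the set of devices.
shards = [torch.arange(4.0) + 10 * i for i in range(3)]

# all_gather: every device ends up with the concatenation of all shards.
gathered_per_device = [torch.cat(shards, dim=0) for _ in range(len(shards))]

# all_reduce: every device ends up with the elementwise sum of all shards.
reduced_per_device = [torch.stack(shards).sum(dim=0) for _ in range(len(shards))]

assert all(torch.equal(g, gathered_per_device[0]) for g in gathered_per_device)
assert all(torch.equal(r, reduced_per_device[0]) for r in reduced_per_device)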

7 files changed: +206 -34 lines changed


sharktank/sharktank/ops/_registry.py

+35-1
@@ -18,6 +18,7 @@
 
 __all__ = [
     "AllOfExprs",
+    "AllOfExprsVariadic",
     "AllOfType",
     "AnyOfType",
     "IsOfType",
@@ -65,7 +66,8 @@ def __call__(self, *args: type) -> bool:
 
 
 class AllOfExprs(BoolTypeExpr):
-    """Returns True if all types match their respective boolean type expression.
+    """Returns True if all type arguments match their respective boolean type
+    expression.
 
     ```python
     # True. int == int and str in (float, str).
@@ -87,6 +89,38 @@ def expr(*types: type):
         super().__init__(expr)
 
 
+class AllOfExprsVariadic(BoolTypeExpr):
+    """Returns True if all type arguments match their respective boolean type
+    expression and any remaining trailing arguments match the last type expression.
+
+    ```python
+    # True. int == int
+    # str in (float, str).
+    # float in (float, str).
+    AllOfExprsVariadic(IsOfType(int), IsOfType(float, str))(int, str, float)
+
+    # False. str is not in (int, float).
+    AllOfExprsVariadic(IsOfType(int), IsOfType(int, float))(int, float, str, int)
+    ```
+    """
+
+    def __init__(self, *exprs: BoolTypeExpr):
+        if len(exprs) == 0:
+            raise ValueError("At least one expression is required.")
+        self._exprs = list(exprs)
+
+        def expr(*types: type):
+            if len(types) < len(self._exprs):
+                return False
+            exprs = self._exprs
+            if len(types) > len(exprs):
+                # pad with the trailing expression.
+                exprs = exprs + ([exprs[-1]] * (len(types) - len(self._exprs)))
+            return all([e(t) for e, t in zip(exprs, types)])
+
+        super().__init__(expr)
+
+
 class AllOfType(BoolTypeExpr):
     """Returns True if all of the types are from a set of types.
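As an illustration of the matching rule AllOfExprsVariadic implements, a standalone sketch (hypothetical helper names, not the sharktank class): the list of type predicates is padded with its last entry so every trailing argument is checked against the final expression.

# Standalone sketch of the variadic matching rule (hypothetical helper, not the
# sharktank class): pad the predicate list with its last entry so any number of
# trailing arguments is checked against the final type expression.
def matches_variadic(predicates, types):
    if len(types) < len(predicates):
        return False
    padded = predicates + [predicates[-1]] * (len(types) - len(predicates))
    return all(pred(t) for pred, t in zip(padded, types))


is_int = lambda t: t is int
is_int_or_float = lambda t: t in (int, float)

print(matches_variadic([is_int, is_int_or_float], [int, float, float, int]))  # True
print(matches_variadic([is_int, is_int_or_float], [int, float, str]))         # False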

sharktank/sharktank/ops/default_impls.py

+42-1
@@ -16,8 +16,9 @@
 
 from ..types import PrimitiveTensor, QuantizedTensor, InferenceTensor
 from ..types.tensors import unbox_tensor, AnyTensor
-from ._registry import AllOfType, AllOfExprs, IsOfType
+from ._registry import AllOfType, AllOfExprs, AllOfExprsVariadic, IsOfType
 from .signatures import *
+import shark_turbine.ops.iree
 
 
 @cat.override(AllOfType(Tensor, PrimitiveTensor))
@@ -80,6 +81,39 @@ def elementwise_binary(operator, x, y):
     return operator(x, y)
 
 
+@elementwise.override(
+    AllOfExprsVariadic(
+        IsOfType(Tensor, InferenceTensor),
+        IsOfType(Tensor, InferenceTensor, Number),
+        IsOfType(Tensor, InferenceTensor, Number),
+    )
+)
+def elementwise_variadic(operator, x, y, *args):
+    """Folds by successively applying the binary operator from left to right until
+    exhaustion.
+
+    Match a variable number of tensor/number arguments with at least 3 such arguments.
+
+    Example matches
+    ```
+    (Tensor, Tensor, Tensor)
+    (Tensor, DefaultPrimitiveTensor, float),
+    (SplitPrimitiveTensor, ReplicatedTensor, int, Tensor)
+    ```
+
+    Will not match
+    ```
+    (Tensor)
+    (Tensor, Tensor)
+    (int, Tensor, Tensor)
+    ```
+    """
+    res = elementwise(operator, x, y)
+    for arg in args:
+        res = elementwise(operator, res, arg)
+    return res
+
+
 # Embedding Lookup
 @embedding_lookup.override(Tensor, Tensor)
 def embedding_lookup_default(input, embedding_matrix, dtype: dtype):
@@ -234,6 +268,13 @@ def permute(tensor: Tensor, dims: List[int]):
     return torch.permute(torch_tensor, dims)
 
 
+@transfer_to_logical_device.override(Tensor)
+def transfer_to_logical_device_default(tensor: Tensor, ordinal: int):
+    return shark_turbine.ops.iree.transfer_to_logical_device(
+        f"{ordinal}", unbox_tensor(tensor)
+    )
+
+
 # Sharded default impls (do nothing).
 
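As a usage sketch with plain torch tensors (the real op also accepts InferenceTensor arguments and numbers), the new variadic elementwise override is simply a left fold of the binary operator:

# Left-fold semantics of the variadic elementwise override, shown with plain
# torch tensors: elementwise(op, a, b, c, d) == op(op(op(a, b), c), d).
import functools

import torch

a, b, c, d = (torch.rand(2, 3) for _ in range(4))
folded = functools.reduce(torch.add, [b, c, d], a)
torch.testing.assert_close(folded, ((a + b) + c) + d)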

sharktank/sharktank/ops/sharded_impls.py

+29-14
@@ -31,15 +31,30 @@
 def all_gather_split(
     input: SplitPrimitiveTensor, *, dim: int | None
 ) -> ReplicatedTensor:
-    assert (
-        dim is None
-    ), "gather dimension other than `input.shard_dim` is not supported."
-    # TODO: figure out how to avoid common sub-expression elimination to not
-    # merge all these into one.
-    # Even if we place each resulting shard inside of ReplicatedTensor on a
-    # distinct logical device with an explicit operation, CSE should still
-    # collapse them.
-    shards = [sharded_cat(input) for i in range(input.shard_count)]
+    dim = input.shard_dim if dim is None else dim
+    # For each device move the shards to it and do a concatenation.
+    # If we don't move first, common sub-expression elimination is free to collapse all
+    # concatenations into one and then copy to all devices, which is not what we want.
+    shards = [
+        cat([transfer_to_logical_device(shard, i) for shard in input.shards], dim=dim)
+        for i in range(input.shard_count)
+    ]
+    return ReplicatedTensor(ts=shards)
+
+
+@all_reduce.override(SplitPrimitiveTensor)
+def all_reduce_split(
+    input: SplitPrimitiveTensor,
+) -> ReplicatedTensor:
+    # For each device move the shards to it and do a reduction.
+    # If we don't move first, common sub-expression elimination is free to collapse all
+    # reductions into one and then copy to all devices, which is not what we want.
+    shards = [
+        elementwise(
+            torch.add, *[transfer_to_logical_device(shard, i) for shard in input.shards]
+        )
+        for i in range(input.shard_count)
+    ]
     return ReplicatedTensor(ts=shards)
 
 
@@ -692,15 +707,15 @@ def reshard_like_split_to_split(
     return tensor
 
 
-# Sharded sum.
-
-
 @sharded_cat.override(SplitPrimitiveTensor)
 def sharded_cat_unsharded(maybe_sharded: SplitPrimitiveTensor):
     shard_ts = [t.as_torch() for t in maybe_sharded.shards]
     return torch.cat(shard_ts, dim=maybe_sharded.shard_dim)
 
 
+# Sharded sum.
+
+
 def _sharded_sum_sharded(tensor: ShardedTensor) -> Tensor:
     accum = tensor.shards[0].as_torch()
     for shard in tensor.shards[1:]:
@@ -709,13 +724,13 @@ def _sharded_sum_sharded(tensor: ShardedTensor) -> Tensor:
 
 
 @sharded_sum.override(SplitPrimitiveTensor)
-def sharded_sum_split(maybe_sharded: SplitPrimitiveTensor):
+def sharded_sum_split(maybe_sharded: SplitPrimitiveTensor) -> Tensor:
    # TODO: Should implement as an all reduce.
    return _sharded_sum_sharded(maybe_sharded)
 
 
 @sharded_sum.override(UnreducedTensor)
-def sharded_sum_unreduced(maybe_sharded: UnreducedTensor):
+def sharded_sum_unreduced(maybe_sharded: UnreducedTensor) -> Tensor:
     return _sharded_sum_sharded(maybe_sharded)
 
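The structure both overrides share is: first transfer every shard to the target device, then perform the concatenation/reduction there, once per device. A rough sketch of that shape, with a hypothetical transfer_to_device stub standing in for transfer_to_logical_device; the point is that the compiler sees N device-tagged computations instead of one result to broadcast.

# Rough sketch of the per-device pattern (hypothetical stub, not the sharktank op):
# every device gets its own copies of all shards and then does its own reduction.
import torch


def transfer_to_device(tensor, ordinal):
    # Placeholder for transfer_to_logical_device: in the compiled program this tags
    # the value with a device ordinal, which keeps the N reductions distinct.
    return tensor.clone()


shards = [torch.rand(2, 3) for _ in range(4)]
reduced_per_device = [
    sum(transfer_to_device(shard, i) for shard in shards)
    for i in range(len(shards))
]
assert all(torch.allclose(r, reduced_per_device[0]) for r in reduced_per_device)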

sharktank/sharktank/ops/signatures.py

+39
@@ -18,6 +18,7 @@
 
 __all__ = [
     "all_gather",
+    "all_reduce",
     "cat",
     "conv2d",
     "elementwise",
@@ -38,6 +39,7 @@
     "scaled_dot_product_attention",
     "sharded_cat",
     "sharded_sum",
+    "transfer_to_logical_device",
     "unshard",
 ]
 
@@ -46,6 +48,7 @@
 
 @overridable
 def all_gather(maybe_sharded: AnyTensor, *, dim: int | None = None) -> AnyTensor:
+    "Gather/concatenate on all devices along dimension `dim`."
     ...
 
 
@@ -62,6 +65,23 @@ def _all_gather_trampoline(
         d.fail(tensors)
 
 
+@overridable
+def all_reduce(tensor: AnyTensor) -> AnyTensor:
+    "Reduce on all devices."
+    ...
+
+
+@all_reduce.trampoline
+def _all_reduce_trampoline(d: SignatureDispatcher, tensor: AnyTensor):
+    tensors = (tensor,)
+    for override in d.find_overrides(tensors):
+        result = override(tensor)
+        if result is not NotImplemented:
+            return override, result
+    else:
+        d.fail(tensors)
+
+
 @overridable
 def cat(tensors: Tuple[AnyTensor, ...] | List[AnyTensor], dim: int = 0) -> AnyTensor:
     ...
@@ -616,6 +636,25 @@ def _sharded_sum_trampoline(d: SignatureDispatcher, maybe_sharded: AnyTensor):
         d.fail(tensors)
 
 
+@overridable
+def transfer_to_logical_device(tensor: AnyTensor, ordinal: int) -> AnyTensor:
+    """Transfer the tensor to a device with ordinal `ordinal`."""
+    ...
+
+
+@transfer_to_logical_device.trampoline
+def _transfer_to_logical_device(
+    d: SignatureDispatcher, tensor: AnyTensor, ordinal: int
+):
+    tensors = (tensor,)
+    for override in d.find_overrides(tensors):
+        result = override(tensor, ordinal)
+        if result is not NotImplemented:
+            return override, result
+    else:
+        d.fail(tensors)
+
+
 @overridable
 def unshard(tensor: AnyTensor) -> AnyTensor:
     """Return the tensor that has the same elements and shape, but is not sharded."""

sharktank/sharktank/types/tensors.py

+6-3
@@ -30,7 +30,6 @@
 from shark_turbine.aot import (
     ExternalTensorTrait,
 )
-from shark_turbine.ops.iree import transfer_to_logical_device
 from ..utils import tree as tree_utils
 
 from ..utils.io import ShardedArchiveBuilder
@@ -618,13 +617,15 @@ def __init__(
         name: str = UnnamedTensorName,
         shape: Optional[list[int]],
     ):
+        from ..ops import transfer_to_logical_device
+
         assert len(ts) > 0
         assert shard_dim is None or len(ts[0].shape) > shard_dim
         super().__init__(name=name, shape=shape, shard_dim=shard_dim)
         self._shards: tuple[DefaultPrimitiveTensor] = tuple(
             DefaultPrimitiveTensor(
                 name=f"{name}.shard.{i}",
-                data=transfer_to_logical_device(f"{i}", unbox_tensor(t)),
+                data=transfer_to_logical_device(t, i),
             )
             for i, t in enumerate(ts)
         )
@@ -867,6 +868,8 @@ def __init__(
         will be replicated that many times.
         """
 
+        from ..ops import transfer_to_logical_device
+
         if isinstance(ts, torch.Tensor):
             assert shard_count is not None
             ts = [ts] * shard_count
@@ -884,7 +887,7 @@ def __init__(
         self._shards: tuple[DefaultPrimitiveTensor] = tuple(
             DefaultPrimitiveTensor(
                 name=f"{name}.shard.{i}",
-                data=transfer_to_logical_device(f"{i}", unbox_tensor(t)),
+                data=transfer_to_logical_device(t, i),
            )
            for i, t in enumerate(ts)
        )

sharktank/tests/ops/ops_test.py

+21
@@ -8,6 +8,7 @@
 
 import torch
 import torch.nn.functional as F
+from parameterized import parameterized
 
 from sharktank import ops
 from sharktank.types import *
@@ -34,6 +35,26 @@ def testBroadcastDims(self):
         assert res[1] == 2
 
 
+class ElementwiseTest(unittest.TestCase):
+    @parameterized.expand(
+        [
+            (torch.add,),
+            (torch.div,),
+            (torch.fmin,),
+            (torch.fmax,),
+            (torch.sub),
+        ]
+    )
+    def testMultiArgOperators(self, operator):
+        a = torch.rand(2, 3, 4, dtype=torch.float32)
+        b = torch.rand(2, 3, 4, dtype=torch.float32)
+        c = torch.rand(2, 3, 4, dtype=torch.float32)
+        d = torch.rand(2, 3, 4, dtype=torch.float32)
+        expected_result = operator(operator(operator(a, b), c), d)
+        actual_result = ops.elementwise(operator, a, b, c, d)
+        torch.testing.assert_close(actual_result, expected_result)
+
+
 class EqualTest(unittest.TestCase):
     def testEqualTorchTensors(self):
         a = torch.rand(2, 3, dtype=torch.float32)
