
Commit 8d508ed

gau-nernst authored and jainapurva committed
[low-bit optim] Fix load state dict when device is different (#1021)
* fix serialization
* fix pytorch 2.3
* fix typo
* update note
1 parent b7126d8 · commit 8d508ed

5 files changed: +118 −11 lines

test/prototype/test_low_bit_optim.py

Lines changed: 26 additions & 3 deletions
@@ -97,7 +97,30 @@ def test_optim_smoke(self, optim_name, dtype, device):
             optim.step()
             optim.zero_grad()
 
-    @pytest.mark.skipif(bnb is None, reason="bitsandbytes is not availablle")
+        # test serialization. also test the case CUDA optim loads CPU state dict
+        with tempfile.NamedTemporaryFile() as f:
+            torch.save(optim.state_dict(), f.name)
+            state_dict = torch.load(f.name, map_location="cpu")
+
+        model2 = copy.deepcopy(model)
+        optim2 = getattr(low_bit_optim, optim_name)(model2.parameters())
+        optim2.load_state_dict(state_dict)
+
+        for _ in range(2):
+            x = torch.randn(4, 32, device=device, dtype=dtype)
+
+            model(x).sum().backward()
+            optim.step()
+            optim.zero_grad()
+
+            model2(x).sum().backward()
+            optim2.step()
+            optim2.zero_grad()
+
+        for p1, p2 in zip(model.parameters(), model2.parameters()):
+            torch.testing.assert_close(p2, p1)
+
+    @pytest.mark.skipif(bnb is None, reason="bitsandbytes is not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="bitsandbytes 8-bit Adam only works for CUDA")
     @pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_3, reason="requires PyTorch >= 2.3")
     @parametrize("optim_name", ["Adam8bit", "AdamW8bit"])

@@ -129,7 +152,7 @@ def test_optim_8bit_correctness(self, optim_name):
             torch.testing.assert_close(p2, p1, rtol=1e-5, atol=1e-5)
 
     # this will not run in CI because we can't install lpmm
-    @pytest.mark.skipif(lpmm is None, reason="lpmm is not availablle")
+    @pytest.mark.skipif(lpmm is None, reason="lpmm is not available")
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="lpmm 4-bit Adam only works for CUDA")
    @pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_3, reason="requires PyTorch >= 2.3")
    @parametrize("optim_name", ["Adam4bit", "AdamW4bit"])

@@ -205,7 +228,7 @@ def test_optim_cpu_offload_save_load(self):
         # save checkpoint. make sure it can be serialized by torch.save()
         with tempfile.NamedTemporaryFile() as file:
             torch.save(optim1.state_dict(), file.name)
-            state_dict = torch.load(file.name)
+            state_dict = torch.load(file.name, map_location="cpu")
 
         # resume training
         model2 = copy.deepcopy(model1)

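The new test above exercises the end-to-end flow that motivated this fix: checkpoint a low-bit optimizer, then load a CPU-mapped state dict back into a CUDA optimizer. A minimal standalone sketch of that flow follows; it assumes torchao.prototype.low_bit_optim exposes AdamW8bit (any of the low-bit optimizers should behave the same way) and that a CUDA device is available, and it mirrors the test rather than adding anything new.

# minimal sketch, assuming torchao.prototype.low_bit_optim.AdamW8bit and a CUDA device
import copy
import tempfile

import torch
from torchao.prototype import low_bit_optim

model = torch.nn.Linear(32, 32, device="cuda")
optim = low_bit_optim.AdamW8bit(model.parameters())

# one step so the quantized optimizer state actually exists
model(torch.randn(4, 32, device="cuda")).sum().backward()
optim.step()
optim.zero_grad()

with tempfile.NamedTemporaryFile() as f:
    torch.save(optim.state_dict(), f.name)
    # map_location="cpu" is the interesting case: the quantized state tensors
    # land on CPU and must be moved back to CUDA during load_state_dict()
    state_dict = torch.load(f.name, map_location="cpu")

model2 = copy.deepcopy(model)
optim2 = low_bit_optim.AdamW8bit(model2.parameters())
optim2.load_state_dict(state_dict)  # the cross-device case this commit fixes
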
torchao/prototype/low_bit_optim/adam.py

Lines changed: 3 additions & 0 deletions
@@ -109,6 +109,9 @@ def step(self, closure=None):
 
 # this will work with any optim state tensor subclass that implements aten.lerp.Scalar and aten.copy_.default
 # and param tensor subclass that implements aten.add_.Tensor, and aten.addcdiv_.default
+# NOTE: right now all of our optimizer state subclasses will dequant to FP32, thus adam computation
+# will be done in FP32 (not purposely). we should explicitly cast all inputs to FP32 to ensure FP32
+# computation. will need to benchmark to ensure no slowdown.
 def single_param_adam(
     p: Tensor,
     grad: Tensor,

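The NOTE added above says the Adam math currently runs in FP32 only as a side effect of dequantization, and that the inputs should eventually be cast explicitly. Below is a hypothetical plain-tensor sketch of that idea; the function name and formula are illustrative only and are not torchao's single_param_adam, which routes the same math through the subclasses' lerp/copy_ handlers instead of dequantizing by hand.

# hypothetical sketch of "explicitly cast all inputs to FP32"; not torchao code
import torch
from torch import Tensor

def adam_update_fp32(p: Tensor, grad: Tensor, exp_avg: Tensor, exp_avg_sq: Tensor,
                     lr: float, beta1: float, beta2: float, eps: float, step: int) -> None:
    # upcast once so the math is FP32 regardless of how the inputs are stored
    p_f32 = p.float()
    grad_f32 = grad.float()
    exp_avg_f32 = exp_avg.float().lerp(grad_f32, 1 - beta1)
    exp_avg_sq_f32 = exp_avg_sq.float().lerp(grad_f32.square(), 1 - beta2)

    bias_corr1 = 1 - beta1 ** step
    bias_corr2 = 1 - beta2 ** step
    denom = (exp_avg_sq_f32 / bias_corr2).sqrt() + eps
    p_f32 -= lr * (exp_avg_f32 / bias_corr1) / denom

    # copy_() casts back to each tensor's storage dtype on the way out
    p.copy_(p_f32)
    exp_avg.copy_(exp_avg_f32)
    exp_avg_sq.copy_(exp_avg_sq_f32)
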
torchao/prototype/low_bit_optim/subclass_4bit.py

Lines changed: 37 additions & 3 deletions
@@ -2,7 +2,8 @@
 
 import torch
 from torch import Tensor
-from torchao.utils import TorchAOBaseTensor
+from torch.utils._python_dispatch import return_and_correct_aliasing
+from torchao.utils import TorchAOBaseTensor, TORCH_VERSION_AT_LEAST_2_4
 
 from .quant_utils import create_dynamic_map, scale_tensor, quantize_4bit_with_qmap, dequant_with_qmap
 

@@ -60,8 +61,9 @@ def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size=No
     def dequantize(self, output_dtype=None):
         codes = torch.stack([self.codes >> 4, self.codes & 0b1111], dim=-1) # unpack
         float_data = dequant_with_qmap(codes, self.qmap, self.scale)
-        dtype = output_dtype or torch.get_default_dtype()
-        return float_data.view(self._shape).to(dtype)
+        if output_dtype is not None:
+            float_data = float_data.to(output_dtype)
+        return float_data.view(self._shape)
 
     @classmethod
     def zeros(cls, shape, signed: bool = True, block_size: int = 128, device=None):

@@ -80,6 +82,24 @@ def __repr__(self):
         )
 
 
+# in pre-2.4, calling .to(device, dtype) will not dispatch aten._to_copy.default when
+# dtype is the same but device is different. thus, we must override .to() method instead.
+if not TORCH_VERSION_AT_LEAST_2_4:
+    def _to(self, *args, **kwargs):
+        # ignore other args/kwargs
+        device = kwargs.pop("device", None)
+        return OptimState4bit(
+            self.codes.to(device),
+            self.scale.to(device),
+            self.qmap.to(device),
+            self.signed,
+            self.shape,
+        )
+
+    OptimState4bit.to = _to
+    del _to # make sure to not re-use
+
+
 @OptimState4bit.implements(aten.copy_.default)
 def _(func, types, args, kwargs):
     dst = args[0]

@@ -107,6 +127,20 @@ def _(func, types, args, kwargs):
     return dst
 
 
+@OptimState4bit.implements(aten._to_copy.default)
+def _(func, types, args, kwargs):
+    # ignore dtype
+    device = kwargs.get("device", None)
+    out = OptimState4bit(
+        args[0].codes.to(device=device),
+        args[0].scale.to(device=device),
+        args[0].qmap.to(device=device),
+        args[0].signed,
+        args[0].shape,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, out)
+
+
 @OptimState4bit.implements(aten.lerp.Scalar)
 def _(func, types, args, kwargs):
     args = [x.dequantize() if isinstance(x, OptimState4bit) else x for x in args]

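A short usage sketch of the device move this file now supports, assuming OptimState4bit is importable from the module path above and a CUDA device is present. On PyTorch >= 2.4 the call dispatches the aten._to_copy.default handler; on older versions the monkey-patched .to() runs instead, but the caller sees the same behavior either way.

# sketch, assuming the import path below and a CUDA device
from torchao.prototype.low_bit_optim.subclass_4bit import OptimState4bit

state = OptimState4bit.zeros((256, 256), signed=True, block_size=128, device="cpu")

# moves codes/scale/qmap to CUDA while keeping the quantized representation
state_cuda = state.to(device="cuda")
assert isinstance(state_cuda, OptimState4bit)
assert state_cuda.codes.device.type == "cuda"
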
torchao/prototype/low_bit_optim/subclass_8bit.py

Lines changed: 36 additions & 3 deletions
@@ -1,6 +1,7 @@
 import torch
 from torch import Tensor
-from torchao.utils import TorchAOBaseTensor
+from torch.utils._python_dispatch import return_and_correct_aliasing
+from torchao.utils import TorchAOBaseTensor, TORCH_VERSION_AT_LEAST_2_4
 
 from .quant_utils import create_dynamic_map, scale_tensor, quantize_8bit_with_qmap, dequant_with_qmap
 

@@ -49,8 +50,10 @@ def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size=No
         return cls(*[tensor_data_dict[name] for name in cls.tensor_attrs], *tensor_attributes)
 
     def dequantize(self, output_dtype=None):
-        dtype = output_dtype or torch.get_default_dtype()
-        return dequant_with_qmap(self.codes, self.qmap, self.scale).to(dtype)
+        float_data = dequant_with_qmap(self.codes, self.qmap, self.scale)
+        if output_dtype is not None:
+            float_data = float_data.to(output_dtype)
+        return float_data
 
     @classmethod
     def zeros(cls, shape, signed: bool = True, block_size: int = 256, device=None):

@@ -66,6 +69,23 @@ def __repr__(self):
         )
 
 
+# in pre-2.4, calling .to(device, dtype) will not dispatch aten._to_copy.default when
+# dtype is the same but device is different. thus, we must override .to() method instead.
+if not TORCH_VERSION_AT_LEAST_2_4:
+    def _to(self, *args, **kwargs):
+        # ignore other args/kwargs
+        device = kwargs.pop("device", None)
+        return OptimState8bit(
+            self.codes.to(device),
+            self.scale.to(device),
+            self.qmap.to(device),
+            self.signed,
+        )
+
+    OptimState8bit.to = _to
+    del _to # make sure to not re-use
+
+
 @OptimState8bit.implements(aten.copy_.default)
 def _(func, types, args, kwargs):
     dst = args[0]

@@ -89,6 +109,19 @@ def _(func, types, args, kwargs):
     return dst
 
 
+@OptimState8bit.implements(aten._to_copy.default)
+def _(func, types, args, kwargs):
+    # ignore dtype
+    device = kwargs.get("device", None)
+    out = OptimState8bit(
+        args[0].codes.to(device=device),
+        args[0].scale.to(device=device),
+        args[0].qmap.to(device=device),
+        args[0].signed,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, out)
+
+
 @OptimState8bit.implements(aten.lerp.Scalar)
 def _(func, types, args, kwargs):
     args = [x.dequantize() if isinstance(x, OptimState8bit) else x for x in args]

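The 8-bit handlers mirror the 4-bit ones, with one detail worth spelling out: the device argument is honored while a dtype argument is deliberately dropped (the "# ignore dtype" comment above), so a move never silently dequantizes the state. A small illustrative sketch, assuming the import path below and a CUDA build:

# sketch, assuming the import path below and a CUDA device
import torch
from torchao.prototype.low_bit_optim.subclass_8bit import OptimState8bit

state = OptimState8bit.zeros((1024,), signed=True, block_size=256, device="cpu")

# device is honored; the dtype request is ignored by the handlers above
moved = state.to(device="cuda", dtype=torch.bfloat16)

assert isinstance(moved, OptimState8bit)
assert moved.codes.dtype == state.codes.dtype  # still quantized codes
assert moved.codes.device.type == "cuda"       # but on the new device
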
torchao/prototype/low_bit_optim/subclass_fp8.py

Lines changed: 16 additions & 2 deletions
@@ -1,5 +1,6 @@
 import torch
 from torch import Tensor
+from torch.utils._python_dispatch import return_and_correct_aliasing
 from torchao.utils import TorchAOBaseTensor
 
 

@@ -21,6 +22,7 @@ def quantize_fp8(input: Tensor, block_size: int):
 
 # NOTE: FP8 sign bit is redundant for unsigned optim state.
 # we may investigate how to use it to increase range/precision for unsigned optim state.
+# https://arxiv.org/abs/2409.12517 uses FP8 E5M2 for 2nd Adam buffer
 class OptimStateFp8(TorchAOBaseTensor):
     tensor_attrs = ["codes", "scale"]
 

@@ -56,8 +58,9 @@ def dequantize(self, output_dtype=None):
         float_data = self.codes.float()
         float_data = float_data.view(-1, self.block_size) * self.scale.view(-1, 1)
 
-        dtype = output_dtype or torch.get_default_dtype()
-        return float_data.view(self.codes.shape).to(dtype)
+        if output_dtype is not None:
+            float_data = float_data.to(output_dtype)
+        return float_data.view(self.codes.shape)
 
     @classmethod
     def zeros(cls, shape, block_size: int = 256, device=None):

@@ -93,6 +96,17 @@ def _(func, types, args, kwargs):
     return dst
 
 
+@OptimStateFp8.implements(aten._to_copy.default)
+def _(func, types, args, kwargs):
+    # ignore dtype
+    device = kwargs.get("device", None)
+    out = OptimStateFp8(
+        args[0].codes.to(device=device),
+        args[0].scale.to(device=device),
+    )
+    return return_and_correct_aliasing(func, args, kwargs, out)
+
+
 @OptimStateFp8.implements(aten.lerp.Scalar)
 def _(func, types, args, kwargs):
     args = [x.dequantize() if isinstance(x, OptimStateFp8) else x for x in args]

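For reference, the dequantize() change above decodes a block-wise FP8 layout: one FP32 scale per block_size chunk of FP8 codes. The toy round-trip below illustrates that layout on plain tensors; it uses ad-hoc absmax scaling and is not the library's quantize_fp8 helper. It assumes torch.float8_e4m3fn is available (PyTorch >= 2.1).

# toy block-wise FP8 round-trip on plain tensors; not library code
import torch

block_size = 4
x = torch.randn(2 * block_size) * 10

# quantize: one absmax scale per block, codes stored in FP8
blocks = x.view(-1, block_size)
scale = blocks.abs().amax(dim=1) / torch.finfo(torch.float8_e4m3fn).max
codes = (blocks / scale.view(-1, 1)).to(torch.float8_e4m3fn)

# dequantize: same shape of computation as OptimStateFp8.dequantize()
float_data = codes.float().view(-1, block_size) * scale.view(-1, 1)
assert torch.allclose(float_data.view(x.shape), x, rtol=0.1, atol=0.1)
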