This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit df940ae

this is failing in backward for some reason with device context failure
1 parent: 4526eb8

2 files changed (+3, -1 lines)

float8_experimental/config.py

Lines changed: 3 additions & 0 deletions
@@ -20,4 +20,7 @@
 # dynamic_use_activation_hooks = True
 # dynamic_use_activation_hooks = False

+# This is a global flag that controls whether the fused_cast kernels are used.
+# This can offer greater performance in eager, but it is still recommended
+# that you set this to False if you are using torch.compile.
 use_fused_cast = True
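
A minimal sketch of how this flag might be toggled at runtime, assuming float8_experimental.config is imported as a plain Python module (the flag name matches this diff; everything else is illustrative):

import float8_experimental.config

# Per the added comment: keep the default (True) for eager execution, but set
# the flag to False before compiling the model with torch.compile.
float8_experimental.config.use_fused_cast = False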

float8_experimental/float8_utils.py

Lines changed: 0 additions & 1 deletion
@@ -72,7 +72,6 @@ def amax_history_to_scale_stack(
 def tensor_to_amax(x, distributed_reduction=False):
     if float8_experimental.config.use_fused_cast and x.is_cuda:
         from float8_experimental.fused_kernels.fused_casting_kernels import abs_max
-
         amax = abs_max(x)
     else:
         amax = x.abs().max()
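
A minimal usage sketch of the two code paths above, assuming tensor_to_amax returns the computed amax and that a CUDA device is available (both are assumptions, not stated in this diff):

import torch
import float8_experimental.config
from float8_experimental.float8_utils import tensor_to_amax

x = torch.randn(64, 64, device="cuda")

float8_experimental.config.use_fused_cast = True   # fused abs_max kernel path
amax_fused = tensor_to_amax(x)

float8_experimental.config.use_fused_cast = False  # eager fallback: x.abs().max()
amax_eager = tensor_to_amax(x)

# The two paths should agree on the amax value.
torch.testing.assert_close(amax_fused, amax_eager)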
