This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 7633925

Authored by vkuzo and committed by facebook-github-bot
move dynamic linear to main directory (#180)
Summary: perf and accuracy are great, so it makes more sense to have this at the same level of importance as `Float8Linear`.

Pull Request resolved: #180

Test Plan:
```
./test/test_everything.sh
```

Reviewed By: drisspg
Differential Revision: D52648574
Pulled By: vkuzo
fbshipit-source-id: cc304c56360731cc8fe4b8bae5418683be75c684
1 parent dd0c596 commit 7633925

File tree

4 files changed: +2 additions, −8 deletions

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -143,7 +143,7 @@ We are working on a new FSDP implementation that uses per-parameter sharding tha
 # code tips

 * `float8_experimental/float8_linear.py` - `Float8Linear` (main user facing entry point for delayed scaling)
-* `float8_experimental/dynamic_linear/dynamic_linear.py` - `Float8DynamicLinear` (main user facing entry point for dynamic scaling)
+* `float8_experimental/float8_dynamic_linear.py` - `Float8DynamicLinear` (main user facing entry point for dynamic scaling)
 * `float8_experimental/float8_tensor.py` - `Float8Tensor`, which allows `Float8Linear` to abide by the `x.dtype == x.grad.dtype` restriction
 * `float8_experimental/tp_linear.py` - `Float8ColumnParallelLinear` / `Float8RowParallelLinear` (TP/SP versions of float8 linear)
```

float8_experimental/dynamic_linear/__init__.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

float8_experimental/float8_linear_utils.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -8,7 +8,7 @@

 import torch
 import torch.distributed as dist
-from float8_experimental.dynamic_linear import Float8DynamicLinear
+from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
 from float8_experimental.float8_linear import Float8Linear

 from float8_experimental.float8_utils import amax_history_to_scale, tensor_to_amax
```
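For downstream code that has to run against both pre- and post-#180 checkouts of `float8_experimental`, the import-path change can be absorbed with a small fallback. This is a hedged sketch, not part of the library: the helper name `import_float8_dynamic_linear` is hypothetical, and only the two module paths shown in the diff above are assumed.

```python
import importlib

def import_float8_dynamic_linear():
    """Return the Float8DynamicLinear class, trying the new module path
    (post commit 7633925) first and falling back to the old package
    location. Returns None if float8_experimental is not installed."""
    candidate_paths = (
        "float8_experimental.float8_dynamic_linear",  # new flat location
        "float8_experimental.dynamic_linear",         # pre-#180 location
    )
    for path in candidate_paths:
        try:
            module = importlib.import_module(path)
            return getattr(module, "Float8DynamicLinear")
        except (ImportError, AttributeError):
            continue  # try the next known location
    return None

Float8DynamicLinear = import_float8_dynamic_linear()
```

Trying the new path first keeps the common case (an up-to-date checkout) to a single import attempt; the old path is only probed when the new one is missing.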
