This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 31fba04

y-sq authored and facebook-github-bot committed
Fix an issue in sync_amax (#169)
Summary: To fix this error:

```
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor []] is at version 1; expected version 0 instead.
```

Also tried

```
torch.no_grad()
def sync_float8_amax_and_scale_history(
```

which didn't work.

We can look into whether there are any better ways to fix this.

Pull Request resolved: #169

Test Plan: ./test/test_fsdp.sh

Reviewed By: vkuzo

Differential Revision: D52373985

Pulled By: y-sq

fbshipit-source-id: a25f4b0fee21dd5801c444b28f8a2f878bbafa35
1 parent f4812ee commit 31fba04
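
The error quoted in the summary comes from autograd's saved-tensor version check: PyTorch records a tensor's version counter when it saves the tensor for backward, and raises if an in-place write bumps that counter before backward runs. A plausible reading of the fix is that slices like `fp8_amax_x_tensor[idx]` are views sharing one version counter with the stacked tensor, so an in-place update through any of them can invalidate a tensor another layer saved for backward; `.clone()` breaks that aliasing. Below is a minimal, self-contained sketch of that failure mode and of the clone workaround. It is not the float8_experimental code; the names `demo`, `stacked`, `slice_a`, `slice_b`, and `w` are made up for illustration.

```python
import torch

def demo(use_clone: bool) -> None:
    # A stacked buffer whose slices are handed out to two consumers,
    # loosely mirroring handing slices of a stacked amax tensor back
    # to individual layers.
    stacked = torch.ones(2)

    # Views share storage and a version counter with `stacked`; clones do not.
    slice_a = stacked[0].clone() if use_clone else stacked[0]
    slice_b = stacked[1].clone() if use_clone else stacked[1]

    w = torch.randn((), requires_grad=True)

    # Autograd saves `slice_a` here, since it is needed to compute d(out)/d(w).
    out = w * slice_a

    # A later in-place update through the *other* slice. Without clone, this
    # bumps the shared version counter and invalidates the saved tensor.
    slice_b.fill_(5.0)

    out.backward()

for use_clone in (False, True):
    try:
        demo(use_clone)
        print(f"use_clone={use_clone}: backward succeeded")
    except RuntimeError as err:
        print(f"use_clone={use_clone}: {err}")
```

Running the sketch prints the same version-counter RuntimeError for the view case and a successful backward for the cloned case.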

File tree: 1 file changed, +3 / -3 lines


float8_experimental/float8_linear_utils.py

Lines changed: 3 additions & 3 deletions
@@ -163,9 +163,9 @@ def sync_float8_amax_and_scale_history(
         # 1. in distributed contexts, syncs amax values across workers
         #
         if dist.is_initialized():
-            child.fp8_amax_x = fp8_amax_x_tensor[idx]
-            child.fp8_amax_w = fp8_amax_w_tensor[idx]
-            child.fp8_amax_dL_dY = fp8_amax_dL_dY_tensor[idx]
+            child.fp8_amax_x = fp8_amax_x_tensor[idx].clone()
+            child.fp8_amax_w = fp8_amax_w_tensor[idx].clone()
+            child.fp8_amax_dL_dY = fp8_amax_dL_dY_tensor[idx].clone()

         #
         # 2. adds the `amax` values to history
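
For context on where the changed lines sit, here is a rough sketch of the sync pattern they belong to, under stated assumptions: a list of layers, each exposing an `fp8_amax_x` buffer, stacked into one tensor, reduced across ranks, and written back per layer. The function name `sync_amax_sketch`, the `layers` argument, and the MAX reduction are illustrative guesses, not the library's actual implementation; only the `fp8_amax_x` attribute, the per-index slicing, and the `.clone()` are taken from the diff.

```python
import torch
import torch.distributed as dist

def sync_amax_sketch(layers) -> None:
    # Stack each layer's scalar amax buffer so one collective can
    # reduce all of them at once.
    amax_tensor = torch.stack([layer.fp8_amax_x for layer in layers])

    if dist.is_initialized():
        # Keep the largest amax observed on any rank.
        dist.all_reduce(amax_tensor, op=dist.ReduceOp.MAX)

    for idx, layer in enumerate(layers):
        # Without .clone(), every layer's buffer would be a view into
        # amax_tensor, sharing its storage and version counter; the
        # clone gives each layer an independent tensor again.
        layer.fp8_amax_x = amax_tensor[idx].clone()
```

The same reasoning would apply to the `fp8_amax_w` and `fp8_amax_dL_dY` buffers changed in the same hunk.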
