
Commit de1ab6d: "change wording"
1 parent: 73c05a1

2 files changed: +3 / -4 lines

torchtitan/models/deepseek_v3/model/state_dict_adapter.py

Lines changed: 2 additions & 3 deletions
@@ -136,7 +136,7 @@ def _add_quantization_scale_inv_tensors(
 
     def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
         """
-        1. Quantize the weights from float32 to float8.
+        1. When saving HF checkpoints, quantize the weights from float32 to float8.
         2. Convert between the HF shape and the torchtitan shape.
         3. Split the GroupedExperts' weight into separate experts' weights.
         """
@@ -149,7 +149,6 @@ def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
                 continue
 
             if "moe.experts" in key:
-                # model.layers.3.mlp.experts.0.down_proj.weight
                 abstract_key = re.sub(r"(\d+)", "{}", key, count=1)
                 layer_num = re.search(r"\d+", key).group(0)
                 new_abstract_key = to_hf_map[abstract_key]
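The comment removed in this hunk shows the HF key format the mapping targets. As a sketch of the remapping plus the expert split from step 3 of the docstring (assuming a GroupedExperts weight of shape [num_experts, out_dim, in_dim]; the torchtitan key and the to_hf_map entry here are illustrative, only the HF key format comes from the removed comment):

import re
import torch

# illustrative mapping entry, not the adapter's real table
to_hf_map = {
    "model.layers.{}.moe.experts.down_proj": "model.layers.{}.mlp.experts.{}.down_proj.weight",
}

key = "model.layers.3.moe.experts.down_proj"  # assumed torchtitan key
grouped = torch.randn(4, 16, 32)              # [num_experts, out_dim, in_dim]

abstract_key = re.sub(r"(\d+)", "{}", key, count=1)  # layer number -> {}
layer_num = re.search(r"\d+", key).group(0)
new_abstract_key = to_hf_map[abstract_key]

hf_state_dict = {}
for expert_id, expert_weight in enumerate(grouped.unbind(0)):
    hf_state_dict[new_abstract_key.format(layer_num, expert_id)] = expert_weight
# keys: model.layers.3.mlp.experts.0.down_proj.weight, ...experts.1..., etc.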
@@ -188,7 +187,7 @@ def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
 
     def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
         """
-        1. Dequantize the weights from float8 to float32.
+        1. When loading from HF checkpoint, dequantize the weights from float8 to float32.
         2. Convert between the HF shape and the torchtitan shape.
         3. Concatenate separate experts' weights into GroupedExperts' weight.
         """

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ dataset = "c4"  # supported datasets: c4_test (2K), c4 (177M)
 data_parallel_replicate_degree = 1
 data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default"  # default / never / always
-tensor_parallel_degree = 1
+tensor_parallel_degree = 8
 enable_async_tensor_parallel = false
 expert_parallel_degree = 1
 pipeline_parallel_degree = 1
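With data_parallel_shard_degree = -1, FSDP absorbs whatever ranks remain after the other parallel dimensions, and the degrees must multiply out to the world size. On a 64-GPU job, for instance, tensor_parallel_degree = 8 would leave 8-way FSDP sharding (the GPU count is an illustration, nothing in the config pins it down). The bump from 1 to 8 presumably reflects that the 671B parameters cannot fit in per-GPU memory without tensor parallelism.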
