
Commit 386ca8d

feat: Added cfg.cudnn_deterministic_mode flag (#2367)
1 parent a0f74c3 commit 386ca8d

19 files changed: +57 −19 lines
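
Every recipe touched by this commit forwards the new config value straight into `training.set_seed` as its `debug_mode` argument; the diff itself does not show what `set_seed` does with that value. As a rough orientation only, here is a minimal sketch of how such a flag is typically wired into PyTorch's determinism controls (assumed behaviour, not the torchtune implementation; `set_seed_sketch` is a hypothetical name):

# Minimal sketch, assuming debug_mode follows PyTorch's deterministic debug modes.
# Not the torchtune implementation.
from typing import Optional, Union

import torch


def set_seed_sketch(seed: int, debug_mode: Optional[Union[str, int]] = None) -> int:
    torch.manual_seed(seed)
    if debug_mode is not None:
        # Accepts "default" / "warn" / "error" (or 0 / 1 / 2) and warns or errors
        # when an op without a deterministic implementation is used.
        torch.set_deterministic_debug_mode(debug_mode)
        strict = debug_mode in ("warn", "error", 1, 2)
        # Restrict cuDNN to deterministic kernels and disable autotuning, which
        # could otherwise pick different (nondeterministic) algorithms between runs.
        torch.backends.cudnn.deterministic = strict
        torch.backends.cudnn.benchmark = not strict
    return seed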

Diff for: recipes/dev/early_exit_finetune_distributed.py

+3 −1

@@ -233,7 +233,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/dev/generate_v2.py

+3 −1

@@ -78,7 +78,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._device = utils.get_device(device=cfg.device)
         self._dtype = training.get_dtype(dtype=cfg.dtype, device=self._device)
         self._logger = utils.get_logger(cfg.log_level)
-        training.set_seed(seed=cfg.seed)
+        training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
 
     def setup(self, cfg: DictConfig) -> None:
         """Setup the model and transforms."""

Diff for: recipes/dev/generate_v2_distributed.py

+3 −1

@@ -87,7 +87,9 @@ def __init__(self, cfg: DictConfig) -> None:
         dist.init_process_group(backend="nccl")
         _, rank = utils.get_world_size_and_rank()
         self._is_rank_zero = rank == 0
-        training.set_seed(seed=cfg.seed)
+        training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
 
     def setup(self, cfg: DictConfig) -> None:
         """Setup the model and transforms."""

Diff for: recipes/eleuther_eval.py

+3 −1

@@ -451,7 +451,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self.device = utils.get_device(device=cfg.device)
         self.dtype = training.get_dtype(dtype=cfg.dtype, device=self.device)
         self.logger = utils.get_logger(cfg.get("log_level", "info"))
-        training.set_seed(seed=cfg.seed)
+        training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
 
         # Eval specific variables
         self.limit = cfg.limit

Diff for: recipes/full_dpo_distributed.py

+3 −1

@@ -180,7 +180,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These attributes constitute the recipe state and are updated by ``load_checkpoint``
         # when ``resume_from_checkpoint`` is ``True``
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/full_finetune_distributed.py

+3 −1

@@ -215,7 +215,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/full_finetune_single_device.py

+3 −1

@@ -184,7 +184,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/generate.py

+3 −1

@@ -40,7 +40,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._quantizer = config.instantiate(cfg.quantizer)
         self._quantization_mode = training.get_quantizer_mode(self._quantizer)
 
-        training.set_seed(seed=cfg.seed)
+        training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
 
     def setup(self, cfg: DictConfig) -> None:
         checkpointer = config.instantiate(cfg.checkpointer)

Diff for: recipes/knowledge_distillation_distributed.py

+3 −1

@@ -130,7 +130,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/knowledge_distillation_single_device.py

+3 −1

@@ -128,7 +128,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/lora_dpo_distributed.py

+3 −1

@@ -172,7 +172,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These attributes constitute the recipe state and are updated by ``load_checkpoint``
         # when ``resume_from_checkpoint`` is ``True``
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/lora_dpo_single_device.py

+3 −1

@@ -129,7 +129,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/lora_finetune_distributed.py

+3 −1

@@ -152,7 +152,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These attributes constitute the recipe state and are updated by ``load_checkpoint``
         # when ``resume_from_checkpoint`` is ``True``
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/lora_finetune_distributed_multi_dataset.py

+3 −1

@@ -155,7 +155,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These attributes constitute the recipe state and are updated by ``load_checkpoint``
         # when ``resume_from_checkpoint`` is ``True``
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/lora_finetune_single_device.py

+3 −1

@@ -144,7 +144,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/ppo_full_finetune_single_device.py

+3 −1

@@ -133,7 +133,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         # manually setting up a generator for the recipe
         self._rng = torch.Generator(self._device).manual_seed(self.seed)
         self._total_steps = 0

Diff for: recipes/qat_distributed.py

+3 −1

@@ -202,7 +202,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/qat_lora_finetune_distributed.py

+3 −1

@@ -174,7 +174,9 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These attributes constitute the recipe state and are updated by ``load_checkpoint``
         # when ``resume_from_checkpoint`` is ``True``
-        self.seed = training.set_seed(seed=cfg.seed)
+        self.seed = training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch

Diff for: recipes/quantize.py

+3 −1

@@ -50,7 +50,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._dtype = training.get_dtype(dtype=cfg.dtype, device=self._device)
         self._quantizer = config.instantiate(cfg.quantizer)
         self._quantization_mode = training.get_quantizer_mode(self._quantizer)
-        training.set_seed(seed=cfg.seed)
+        training.set_seed(
+            seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
+        )
 
     def load_checkpoint(self, checkpointer_cfg: DictConfig) -> Dict[str, Any]:
         self._checkpointer = config.instantiate(checkpointer_cfg)
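
Because each call site reads the flag with `cfg.get("cudnn_deterministic_mode", None)`, configs that do not define the key keep their current behaviour (`debug_mode=None`). A hypothetical usage sketch of the pattern added above, where the "warn" value is an assumption rather than something shown in this diff:

# Hypothetical usage sketch; the "warn" value is assumed, not taken from this commit.
from omegaconf import OmegaConf

from torchtune import training

cfg = OmegaConf.create({"seed": 1234, "cudnn_deterministic_mode": "warn"})

# Mirrors the pattern added to each recipe: a missing key falls back to None,
# so existing configs are unaffected.
seed = training.set_seed(
    seed=cfg.seed, debug_mode=cfg.get("cudnn_deterministic_mode", None)
)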
