Hotfix for DDP tests: decrease lr for stability

AlekseySh · AlekseySh · commit ec091f183fc1 · 2024-04-18T00:51:08.000+07:00
diff --git a/tests/test_runs/test_ddp_cases/run_retrieval_experiment_ddp.py b/tests/test_runs/test_ddp_cases/run_retrieval_experiment_ddp.py
@@ -153,7 +153,7 @@ def check_and_save_ids(self, outputs: List[Any], mode: str) -> None:
         torch.save(ids_per_step_synced, pattern.format(experiment=self.exp_num, epoch=self.trainer.current_epoch))
 
     def configure_optimizers(self) -> Any:
-        return Adam(params=self.parameters(), lr=0.5)
+        return Adam(params=self.parameters(), lr=1e-3)
 
     def on_train_end(self) -> None:
         torch.save(self.model, self.save_path_ckpt_pattern.format(experiment=self.exp_num))
diff --git a/tests/test_runs/test_ddp_cases/test_train_with_metrics.py b/tests/test_runs/test_ddp_cases/test_train_with_metrics.py
@@ -36,8 +36,8 @@
 
 @pytest.mark.long
 @pytest.mark.parametrize("batch_size", [12])
-@pytest.mark.parametrize("max_epochs", [2])
-@pytest.mark.parametrize("num_labels,atol", [(120, 1e-2), (1200, 2e-2)])
+@pytest.mark.parametrize("max_epochs", [4])
+@pytest.mark.parametrize("num_labels,atol", [(120, 1e-2), (360, 2e-2)])
 def test_metrics_is_similar_in_ddp(num_labels: int, atol: float, batch_size: int, max_epochs: int) -> None:
     devices = (1, 2, 3)
     # We will compare metrics from same experiment but with different amount of devices. For this we aggregate
@@ -48,18 +48,19 @@ def test_metrics_is_similar_in_ddp(num_labels: int, atol: float, batch_size: int
     metric_topk2values = defaultdict(list)
 
     for num_devices in devices:
-        batch_size //= num_devices
+        batch_size_eff = batch_size // num_devices
+
         params = (
             f"--devices {num_devices} "
             f"--max_epochs {max_epochs} "
             f"--num_labels {num_labels} "
-            f"--batch_size {batch_size}"
+            f"--batch_size {batch_size_eff}"
         )
         cmd = f"python {exp_file} " + params
         subprocess.run(cmd, check=True, shell=True)
 
         metrics_path = MetricValCallbackWithSaving.save_path_pattern.format(
-            devices=num_devices, batch_size=batch_size, num_labels=num_labels
+            devices=num_devices, batch_size=batch_size_eff, num_labels=num_labels
         )
         metrics = torch.load(metrics_path)[OVERALL_CATEGORIES_KEY]
         Path(metrics_path).unlink(missing_ok=True)