File tree Expand file tree Collapse file tree 2 files changed +21
-3
lines changed Expand file tree Collapse file tree 2 files changed +21
-3
lines changed Original file line number Diff line number Diff line change @@ -47,9 +47,10 @@ def train(num_epochs):
47
47
verbose = dist .get_rank () == 0 # print only on global_rank==0
48
48
if verbose :
49
49
mlflow .set_tracking_uri ("/scratch/project_2001659/mvsjober/mlruns" )
50
- slurm_id = os .getenv ("SLURM_JOB_ID" )
51
- if slurm_id :
52
- mlflow .start_run (run_name = slurm_id )
50
+ #mlflow.set_tracking_uri("sqlite:////scratch/project_2001659/mvsjober/mlruns.db")
51
+ #mlflow.set_tracking_uri('https://mats-mlflow2.rahtiapp.fi/')
52
+
53
+ mlflow .start_run (run_name = os .getenv ("SLURM_JOB_ID" ))
53
54
54
55
model = ConvNet ().cuda ()
55
56
batch_size = 100
Original file line number Diff line number Diff line change
1
+ #!/bin/bash
2
+ #SBATCH --account=project_2001659
3
+ #SBATCH --partition=gputest
4
+ #SBATCH --ntasks=1
5
+ #SBATCH --cpus-per-task=10
6
+ #SBATCH --mem=64G
7
+ #SBATCH --time=15
8
+ #SBATCH --gres=gpu:v100:1
9
+
10
+ module purge
11
+ module load pytorch
12
+
13
+ # Old way with torch.distributed.run
14
+ # srun python3 -m torch.distributed.run --standalone --nnodes=1 --nproc_per_node=4 mnist_ddp.py --epochs=100
15
+
16
+ # New way with torchrun
17
+ srun torchrun --standalone --nnodes=1 --nproc_per_node=1 mnist_ddp_mlflow.py --epochs=100
You can’t perform that action at this time.
0 commit comments