PPO - Add hyperparam tuning with ray.tune #54

Open · wants to merge 16 commits into base: ppo-nm
5 changes: 2 additions & 3 deletions install/install.sh
100644 → 100755
@@ -15,9 +15,8 @@ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh

cd idl-2021-wo-rl

conda create --name car-racing python=3.8
conda activate car-racing

pip install -r requirements
pip install -r requirements.txt
pip install git+https://github.com/xeviknal/gym.git@master
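
Note (not part of the diff): a quick post-install smoke test, assuming the xeviknal/gym fork keeps the standard CarRacing-v0 id and that the commands above ran inside the car-racing conda environment.

# Hypothetical smoke test; run inside the car-racing env after install.sh finishes.
import gym
import torch

env = gym.make('CarRacing-v0')   # assumes the fork exposes the usual CarRacing id
obs = env.reset()
print('observation shape:', obs.shape)
print('CUDA available:', torch.cuda.is_available())
env.close()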
75 changes: 50 additions & 25 deletions main.py
@@ -1,5 +1,7 @@
import time
import torch
import numpy as np
from ray import tune

import helpers
from environment import CarRacingEnv
@@ -9,42 +11,65 @@
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#for concurrent runs and logging
experiment='ppo-nm'

def train(config):
# Reproducibility: manual seeding
seed = config['seed']
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

config['params_path'] = f'./params/policy-params-{experiment}-{int(time.time())}.dl'

# make sure that params folder exists
helpers.create_directory('params')

env = CarRacingEnv(device, seed, config['stack_frames'], config['train'])
helpers.display_start()
# Train it first
trainer = Trainer(env, config)
trainer.train()

# Let's store a vid with one episode
config['train'] = False
runner = Runner(env, config)
runner.run()
config['train'] = True


# for concurrent runs and logging
experiment = 'ppo-nm-hp-tuning-max'
if __name__ == "__main__":
hyperparams = {
'num_epochs': 25000, # Number of training episodes
'num_ppo_epochs': 10,
'num_epochs': 1500, # Number of training episodes
'num_ppo_epochs': tune.randint(3, 5),
'mini_batch_size': 128,
'memory_size': 2000,
'eps': 0.2,
'c1': 1., # Value Function coeff
'c2': 0.01, # Entropy coeff
'c1': tune.quniform(0.5, 2.5, 0.25), # Value Function coeff
'c2': tune.quniform(0.00, 0.16, 0.02), # Entropy coeff
'lr': 1e-3, # Learning rate
'gamma': 0.99, # Discount rate
'log_interval': 10, # controls how often we log progress
'stack_frames': 4,
'device': device,
'experiment':experiment,
'params_path': f'./params/policy-params-{experiment}.dl',
'action_set_num': 0,
'train': True
'experiment': experiment,
'action_set_num': 4,
'train': True,
'seed': tune.grid_search([7081960, 1000, 190421])
}

# Reproducibility: manual seeding
seed = 7081960 # Yann LeCun birthday
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
analysis = tune.run(
train,
metric='running_reward',
mode='max',
num_samples=18,
resources_per_trial={"cpu": 0.4, "gpu": 0.3},
config=hyperparams,
)

# make sure that params folder exists
helpers.create_directory('params')
print("Best config: ", analysis.get_best_config(
metric="running_reward", mode="max"))

env = CarRacingEnv(device, seed, hyperparams['stack_frames'], hyperparams['train'])
helpers.display_start()
if hyperparams['train']:
trainer = Trainer(env, hyperparams)
trainer.train()
else:
runner = Runner(env, hyperparams)
runner.run()
# Get a dataframe for analyzing trial results.
df = analysis.results_df
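
Note (not part of the diff): because tune.grid_search is used for the seeds, num_samples=18 is repeated once per grid value, so this run launches 54 trials (18 repeats of the 3-seed grid). A minimal, self-contained sketch of the same ray.tune pattern, using a stand-in trainable instead of the real train(config) above:

# Minimal ray.tune sketch mirroring the search space above (illustrative only).
from ray import tune

def dummy_train(config):
    # stand-in for the real trainable; reports one value and exits
    tune.report(running_reward=config['c1'] - config['c2'])

analysis = tune.run(
    dummy_train,
    metric='running_reward',
    mode='max',
    num_samples=2,  # each sample is repeated for every grid_search value
    config={
        'num_ppo_epochs': tune.randint(3, 5),   # samples 3 or 4 (upper bound exclusive)
        'c1': tune.quniform(0.5, 2.5, 0.25),    # value-function coefficient
        'c2': tune.quniform(0.00, 0.16, 0.02),  # entropy coefficient
        'seed': tune.grid_search([7081960, 1000, 190421]),
    },
)
print(analysis.get_best_config(metric='running_reward', mode='max'))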
4 changes: 3 additions & 1 deletion requirements.txt
@@ -6,4 +6,6 @@ IPython
torch
torchvision
opencv-python
tensorboard
tensorboard
ray==1.0.1.post1
ray[tune]
20 changes: 11 additions & 9 deletions trainer.py
@@ -1,3 +1,4 @@
from ray import tune
import numpy as np
import torch
import torch.nn as nn
@@ -15,6 +16,7 @@ def __init__(self, env, config):
super().__init__()
self.env = env
self.config = config
self.experiment = config['experiment']
self.gamma = config['gamma']
self.input_channels = config['stack_frames']
self.device = config['device']
@@ -23,14 +25,13 @@ def __init__(self, env, config):
self.mini_batch = config['mini_batch_size']
self.memory_size = config['memory_size']
self.c1, self.c2, self.eps = config['c1'], config['c2'], config['eps']
self.writer = SummaryWriter(flush_secs=5)
self.writer = SummaryWriter(flush_secs=5, log_dir=f'runs/{self.experiment}')
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.memory = ReplayMemory(self.memory_size)
self.value_loss = nn.SmoothL1Loss()
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
self.experiment = config['experiment']
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

@@ -125,17 +126,18 @@ def policy_update(self, transitions, v_targ, adv, iteration):
loss = -l_clip + l_vf - l_entropy

self.optimizer.zero_grad()
self.writer.add_scalar(f'{self.experiment}/loss', loss.item(), iteration)
self.writer.add_scalar(f'{self.experiment}/entropy', l_entropy.item(), iteration)
self.writer.add_scalar(f'{self.experiment}/ratio', rt.mean().item(), iteration)
self.writer.add_scalar(f'{self.experiment}/advantage', adv.mean().item(), iteration)
self.writer.add_scalar(f'{self.experiment}/vf', l_vf.item(), iteration)
self.writer.add_scalar(f'loss', loss.item(), iteration)
self.writer.add_scalar(f'entropy', l_entropy.item(), iteration)
self.writer.add_scalar(f'ratio', rt.mean().item(), iteration)
self.writer.add_scalar(f'advantage', adv.mean().item(), iteration)
self.writer.add_scalar(f'vf', l_vf.item(), iteration)
loss.backward()
self.optimizer.step()

def logging_episode(self, i_episode, ep_reward, running_reward):
self.writer.add_scalar(f'{self.experiment}/reward', ep_reward, i_episode)
self.writer.add_scalar(f'{self.experiment}/running reward', running_reward, i_episode)
self.writer.add_scalar(f'reward', ep_reward, i_episode)
self.writer.add_scalar(f'running reward', running_reward, i_episode)
tune.report(iterations=i_episode, running_reward=running_reward)

def train(self):
# Training loop
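
Note (not part of the diff): the two logging changes work together; the per-experiment log_dir keeps concurrent trials from writing into the same TensorBoard run, and tune.report is what feeds running_reward back to tune.run for the metric/mode ranking in main.py. A trimmed sketch of just those hooks, assuming the fields set in __init__:

# Trimmed, illustrative sketch of the logging hooks added in this diff.
from ray import tune
from torch.utils.tensorboard import SummaryWriter

class LoggingSketch:
    def __init__(self, experiment):
        self.experiment = experiment
        # one TensorBoard run directory per experiment name
        self.writer = SummaryWriter(flush_secs=5, log_dir=f'runs/{self.experiment}')

    def logging_episode(self, i_episode, ep_reward, running_reward):
        self.writer.add_scalar('reward', ep_reward, i_episode)
        self.writer.add_scalar('running reward', running_reward, i_episode)
        # hands the episode metric back to ray.tune, which ranks trials by running_reward
        tune.report(iterations=i_episode, running_reward=running_reward)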