diff --git a/install/install.sh b/install/install.sh
old mode 100644
new mode 100755
index 5ae789d..7bb05e8
--- a/install/install.sh
+++ b/install/install.sh
@@ -15,9 +15,8 @@
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 chmod +x Miniconda3-latest-Linux-x86_64.sh
 sh Miniconda3-latest-Linux-x86_64.sh
 
-cd idl-2021-wo-rl
-
 conda create --name car-racing python=3.8
 conda activate car-racing
-pip install -r requirements
+pip install -r requirements.txt
+pip install git+https://github.com/xeviknal/gym.git@master
diff --git a/main.py b/main.py
index d08ca25..df61d04
--- a/main.py
+++ b/main.py
@@ -1,5 +1,7 @@
+import time
 import torch
 import numpy as np
+from ray import tune
 
 import helpers
 from environment import CarRacingEnv
@@ -9,42 +11,65 @@
 # if gpu is to be used
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-#for concurrent runs and logging
-experiment='ppo-nm'
+
+def train(config):
+    # Reproducibility: manual seeding
+    seed = config['seed']
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed)
+
+    config['params_path'] = f'./params/policy-params-{experiment}-{int(time.time())}.dl'
+
+    # make sure that params folder exists
+    helpers.create_directory('params')
+
+    env = CarRacingEnv(device, seed, config['stack_frames'], config['train'])
+    helpers.display_start()
+    # Train it first
+    trainer = Trainer(env, config)
+    trainer.train()
+
+    # Let's store a vid with one episode
+    config['train'] = False
+    runner = Runner(env, config)
+    runner.run()
+    config['train'] = True
+
+
+# for concurrent runs and logging
+experiment = 'ppo-nm-hp-tuning-max'
 
 if __name__ == "__main__":
     hyperparams = {
-        'num_epochs': 25000,  # Number of training episodes
-        'num_ppo_epochs': 10,
+        'num_epochs': 1500,  # Number of training episodes
+        'num_ppo_epochs': tune.randint(3, 5),
         'mini_batch_size': 128,
         'memory_size': 2000,
         'eps': 0.2,
-        'c1': 1.,  # Value Function coeff
-        'c2': 0.01,  # Entropy coeff
+        'c1': tune.quniform(0.5, 2.5, 0.25),  # Value Function coeff
+        'c2': tune.quniform(0.00, 0.16, 0.02),  # Entropy coeff
         'lr': 1e-3,  # Learning rate
         'gamma': 0.99,  # Discount rate
         'log_interval': 10,  # controls how often we log progress
         'stack_frames': 4,
         'device': device,
-        'experiment':experiment,
-        'params_path': f'./params/policy-params-{experiment}.dl',
-        'action_set_num': 0,
-        'train': True
+        'experiment': experiment,
+        'action_set_num': 4,
+        'train': True,
+        'seed': tune.grid_search([7081960, 1000, 190421])
     }
-    # Reproducibility: manual seeding
-    seed = 7081960  # Yann LeCun birthday
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    np.random.seed(seed)
+    analysis = tune.run(
+        train,
+        metric='running_reward',
+        mode='max',
+        num_samples=18,
+        resources_per_trial={"cpu": 0.4, "gpu": 0.3},
+        config=hyperparams,
+    )
 
-    # make sure that params folder exists
-    helpers.create_directory('params')
+    print("Best config: ", analysis.get_best_config(
+        metric="running_reward", mode="max"))
 
-    env = CarRacingEnv(device, seed, hyperparams['stack_frames'], hyperparams['train'])
-    helpers.display_start()
-    if hyperparams['train']:
-        trainer = Trainer(env, hyperparams)
-        trainer.train()
-    else:
-        runner = Runner(env, hyperparams)
-        runner.run()
+    # Get a dataframe for analyzing trial results.
+    df = analysis.results_df
diff --git a/requirements.txt b/requirements.txt
index 74d11cb..15df770
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,6 @@ IPython
 torch
 torchvision
 opencv-python
-tensorboard
\ No newline at end of file
+tensorboard
+ray==1.0.1.post1
+ray[tune]
diff --git a/trainer.py b/trainer.py
index 1ed15b1..21676c7
--- a/trainer.py
+++ b/trainer.py
@@ -1,3 +1,4 @@
+from ray import tune
 import numpy as np
 import torch
 import torch.nn as nn
@@ -15,6 +16,7 @@ def __init__(self, env, config):
         super().__init__()
         self.env = env
         self.config = config
+        self.experiment = config['experiment']
         self.gamma = config['gamma']
         self.input_channels = config['stack_frames']
         self.device = config['device']
@@ -23,14 +25,13 @@ def __init__(self, env, config):
         self.mini_batch = config['mini_batch_size']
         self.memory_size = config['memory_size']
         self.c1, self.c2, self.eps = config['c1'], config['c2'], config['eps']
-        self.writer = SummaryWriter(flush_secs=5)
+        self.writer = SummaryWriter(flush_secs=5, log_dir=f'runs/{self.experiment}')
         self.action_set = get_action(config['action_set_num'])
         self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
         self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
         self.memory = ReplayMemory(self.memory_size)
         self.value_loss = nn.SmoothL1Loss()
         self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
-        self.experiment = config['experiment']
 
         if optim_params is not None:
             self.optimizer.load_state_dict(optim_params)
@@ -125,17 +126,18 @@ def policy_update(self, transitions, v_targ, adv, iteration):
         loss = -l_clip + l_vf - l_entropy
 
         self.optimizer.zero_grad()
-        self.writer.add_scalar(f'{self.experiment}/loss', loss.item(), iteration)
-        self.writer.add_scalar(f'{self.experiment}/entropy', l_entropy.item(), iteration)
-        self.writer.add_scalar(f'{self.experiment}/ratio', rt.mean().item(), iteration)
-        self.writer.add_scalar(f'{self.experiment}/advantage', adv.mean().item(), iteration)
-        self.writer.add_scalar(f'{self.experiment}/vf', l_vf.item(), iteration)
+        self.writer.add_scalar(f'loss', loss.item(), iteration)
+        self.writer.add_scalar(f'entropy', l_entropy.item(), iteration)
+        self.writer.add_scalar(f'ratio', rt.mean().item(), iteration)
+        self.writer.add_scalar(f'advantage', adv.mean().item(), iteration)
+        self.writer.add_scalar(f'vf', l_vf.item(), iteration)
         loss.backward()
         self.optimizer.step()
 
     def logging_episode(self, i_episode, ep_reward, running_reward):
-        self.writer.add_scalar(f'{self.experiment}/reward', ep_reward, i_episode)
-        self.writer.add_scalar(f'{self.experiment}/running reward', running_reward, i_episode)
+        self.writer.add_scalar(f'reward', ep_reward, i_episode)
+        self.writer.add_scalar(f'running reward', running_reward, i_episode)
+        tune.report(iterations=i_episode, running_reward=running_reward)
 
     def train(self):
         # Training loop
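
For reference: in Ray Tune (ray==1.0.1.post1 as pinned above), a tune.grid_search entry is multiplied by num_samples, so the tune.run call in main.py should launch 18 samples x 3 grid seeds = 54 trials, each invoking train(config) with one concrete draw of num_ppo_epochs, c1 and c2. Below is a minimal, self-contained sketch of the same tune.run API with the search-space values taken from the patch; dummy_train is a hypothetical stand-in for main.train (which needs the full environment), and the reported metric is fake, just enough for Tune to rank trials.

    from ray import tune

    def dummy_train(config):
        # Stand-in trainable: report a fake running_reward once so Tune can rank trials.
        tune.report(running_reward=config["c1"] - config["c2"])

    analysis = tune.run(
        dummy_train,
        metric="running_reward",
        mode="max",
        num_samples=2,  # the patch uses 18; kept small here for a quick dry run
        config={
            "num_ppo_epochs": tune.randint(3, 5),        # samples 3 or 4 (upper bound exclusive)
            "c1": tune.quniform(0.5, 2.5, 0.25),         # Value Function coeff
            "c2": tune.quniform(0.00, 0.16, 0.02),       # Entropy coeff
            "seed": tune.grid_search([7081960, 1000, 190421]),
        },
    )
    print(analysis.get_best_config(metric="running_reward", mode="max"))

As in the patch, tune.report inside the trainable is what feeds the running_reward metric that tune.run maximizes and that analysis.get_best_config / analysis.results_df summarize afterwards.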