#!/usr/bin/env python

# # Practical PyTorch: Playing GridWorld with Reinforcement Learning (Actor-Critic with REINFORCE)

# ## Resources

# ## Requirements

import numpy as np
from itertools import count
import random, math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

from helpers import *

# Configuration

gamma = 0.9 # Discount factor for future rewards

hidden_size = 50
learning_rate = 1e-4
weight_decay = 1e-5

log_every = 1000
render_every = 20000

import sconce
job = sconce.Job('rl2', {
    'gamma': gamma,
    'learning_rate': learning_rate,
})
job.log_every = log_every
job.plot_every = 500
# Exploration: dropout applied to the action scores, annealed from DROP_MAX
# down to DROP_MIN over the first DROP_OVER episodes (see select_action below)
DROP_MAX = 0.3
DROP_MIN = 0.05
DROP_OVER = 200000

# ## The Grid World, Agent and Environment

# ### The Grid

MIN_PLANT_VALUE = -1   # Lowest possible plant value (harmful)
MAX_PLANT_VALUE = 0.5  # Highest possible plant value (food)
GOAL_VALUE = 10        # Value of the goal square
EDGE_VALUE = -10       # Value of the border squares (stepping on one is fatal)
VISIBLE_RADIUS = 1     # The agent sees a square window of side 2 * VISIBLE_RADIUS + 1 around itself
class Grid:
    def __init__(self, grid_size=8, n_plants=15):
        self.grid_size = grid_size
        self.n_plants = n_plants

    def reset(self):
        padded_size = self.grid_size + 2 * VISIBLE_RADIUS
        self.grid = np.zeros((padded_size, padded_size)) # Padding for edges

        # Edges
        self.grid[0:VISIBLE_RADIUS, :] = EDGE_VALUE
        self.grid[-1*VISIBLE_RADIUS:, :] = EDGE_VALUE
        self.grid[:, 0:VISIBLE_RADIUS] = EDGE_VALUE
        self.grid[:, -1*VISIBLE_RADIUS:] = EDGE_VALUE

        # Randomly placed plants
        for i in range(self.n_plants):
            plant_value = random.random() * (MAX_PLANT_VALUE - MIN_PLANT_VALUE) + MIN_PLANT_VALUE
            ry = random.randint(0, self.grid_size-1) + VISIBLE_RADIUS
            rx = random.randint(0, self.grid_size-1) + VISIBLE_RADIUS
            self.grid[ry, rx] = plant_value

        # Goal in one of the corners
        S = VISIBLE_RADIUS
        E = self.grid_size + VISIBLE_RADIUS - 1
        gps = [(E, E), (S, E), (E, S), (S, S)]
        gp = gps[random.randint(0, len(gps)-1)]
        self.grid[gp] = GOAL_VALUE

    def visible(self, pos):
        y, x = pos
        return self.grid[y-VISIBLE_RADIUS:y+VISIBLE_RADIUS+1, x-VISIBLE_RADIUS:x+VISIBLE_RADIUS+1]

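# A quick, optional sanity check of the grid layout (illustration only, safe to
# delete): after reset() the playable area sits inside a border of EDGE_VALUE
# cells, and visible() returns the square window of side 2 * VISIBLE_RADIUS + 1
# around a given position.
_demo_grid = Grid(grid_size=8, n_plants=15)
_demo_grid.reset()
assert _demo_grid.grid.shape == (8 + 2 * VISIBLE_RADIUS, 8 + 2 * VISIBLE_RADIUS)
assert _demo_grid.visible((VISIBLE_RADIUS, VISIBLE_RADIUS)).shape == (2 * VISIBLE_RADIUS + 1, 2 * VISIBLE_RADIUS + 1)
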
# ### The Agent

START_HEALTH = 1
STEP_VALUE = -0.02 # Health lost per step

class Agent:
    def reset(self):
        # Position is assigned by the Environment when an episode starts
        self.health = START_HEALTH

    def act(self, action):
        # Move according to action: 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
        y, x = self.pos
        if action == 0: y -= 1
        elif action == 1: x += 1
        elif action == 2: y += 1
        elif action == 3: x -= 1
        self.pos = (y, x)
        self.health += STEP_VALUE # Gradually getting hungrier

# ### The Environment

class Environment:
    def __init__(self):
        self.grid = Grid()
        self.agent = Agent()

    def reset(self):
        """Start a new episode by resetting grid and agent"""
        self.grid.reset()
        self.agent.reset()
        c = math.floor(self.grid.grid_size / 2)
        self.agent.pos = (c, c) # Start the agent near the center of the grid

        self.t = 0
        self.history = []
        self.record_step()

        return self.visible_state

    def record_step(self):
        """Add the current state to history for display later"""
        grid = np.array(self.grid.grid)
        grid[self.agent.pos] = self.agent.health * 0.5 # Agent marker faded by health
        visible = np.array(self.grid.visible(self.agent.pos))
        self.history.append((grid, visible, self.agent.health))

    @property
    def visible_state(self):
        """Return the visible area surrounding the agent, and current agent health"""
        visible = self.grid.visible(self.agent.pos)
        y, x = self.agent.pos
        yp = (y - VISIBLE_RADIUS) / self.grid.grid_size # Position normalized by grid size
        xp = (x - VISIBLE_RADIUS) / self.grid.grid_size
        extras = [self.agent.health, yp, xp]
        return np.concatenate((visible.flatten(), extras), 0)

    def step(self, action):
        """Update state (grid and agent) based on an action"""
        self.agent.act(action)

        # Get reward from where agent landed, add to agent health
        value = self.grid.grid[self.agent.pos]
        self.grid.grid[self.agent.pos] = 0
        self.agent.health += value

        # Check if agent won (reached the goal) or lost (health reached 0)
        won = value == GOAL_VALUE
        lost = self.agent.health <= 0
        done = won or lost

        # Rewards at end of episode
        if won:
            reward = 1
        elif lost:
            reward = -1
        else:
            reward = 0 # Reward will only come at the end
            # reward = value # Try this for quicker learning

        # Save in history
        self.record_step()

        return self.visible_state, reward, done

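# A minimal sketch of the environment API using random actions (illustration
# only; the actual agent below chooses actions with a learned policy):
_demo_env = Environment()
_demo_state = _demo_env.reset()
_demo_done = False
while not _demo_done:
    _demo_action = random.randint(0, 3) # 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
    _demo_state, _demo_reward, _demo_done = _demo_env.step(_demo_action)
# _demo_reward is now +1 (reached the goal) or -1 (ran out of health or hit an edge)
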
# ## Actor-Critic network

class Policy(nn.Module):
    def __init__(self, hidden_size):
        super(Policy, self).__init__()

        visible_squares = (VISIBLE_RADIUS * 2 + 1) ** 2
        input_size = visible_squares + 1 + 2 # Plus agent health, y, x

        self.inp = nn.Linear(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, 4 + 1, bias=False) # For both action scores and expected value

    def forward(self, x):
        x = x.view(1, -1)
        x = F.tanh(x) # Squash inputs
        x = F.relu(self.inp(x))
        x = self.out(x)

        # Split the five outputs into four action scores and one state value
        scores = x[:,:4]
        value = x[:,4]
        return scores, value

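# A small shape check of the forward pass (illustration only): a visible state
# has (2 * VISIBLE_RADIUS + 1)**2 + 3 = 12 values, and the network returns four
# action scores plus one scalar value estimate.
_demo_policy = Policy(hidden_size=hidden_size)
_demo_scores, _demo_value = _demo_policy(Variable(torch.zeros((VISIBLE_RADIUS * 2 + 1) ** 2 + 3)))
assert _demo_scores.size() == (1, 4) and _demo_value.size() == (1,)
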
# ## Selecting actions

def select_action(e, state):
    # Anneal exploration dropout from DROP_MAX to DROP_MIN over DROP_OVER episodes
    drop = interpolate(e, DROP_MAX, DROP_MIN, DROP_OVER)

    state = Variable(torch.from_numpy(state).float())
    scores, value = policy(state) # Forward state through network
    scores = F.dropout(scores, drop, True) # Dropout for exploration
    scores = F.softmax(scores)
    action = scores.multinomial() # Sample an action from the resulting probabilities

    return action, value

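# `interpolate` is defined in helpers.py (not shown here). Assuming it is a
# plain linear schedule from `start` to `end` over `over` episodes, it would
# look roughly like this sketch (named differently so it does not shadow the
# imported helper):
def linear_anneal(step, start, end, over):
    """Linearly interpolate from `start` to `end` as `step` goes from 0 to `over`."""
    t = min(step / float(over), 1.0) # Fraction of the schedule completed, clamped to 1
    return start + (end - start) * t
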
# ## Playing through an episode

def run_episode(e):
    state = env.reset()
    actions = []
    values = []
    rewards = []
    done = False

    while not done:
        action, value = select_action(e, state)
        state, reward, done = env.step(action.data[0, 0]) # Extract the sampled action index
        actions.append(action)
        values.append(value)
        rewards.append(reward)

    return actions, values, rewards

# ## Using REINFORCE with a value baseline

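# REINFORCE with a baseline: for each step t of an episode, the actor is
# nudged along (R_t - V(s_t)) * grad log pi(a_t | s_t), where R_t is the
# discounted return from step t and V(s_t) is the critic's value estimate.
# Subtracting the baseline reduces the variance of the policy gradient without
# biasing it. The critic itself is trained by regressing V(s_t) toward R_t
# with a mean squared error loss.
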
mse = nn.MSELoss()

def finish_episode(e, actions, values, rewards):

    # Calculate discounted rewards, going backwards from end
    discounted_rewards = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Use REINFORCE on chosen actions and associated discounted rewards
    value_loss = 0
    for action, value, reward in zip(actions, values, discounted_rewards):
        reward_diff = reward - value.data[0] # Treat critic value as baseline
        action.reinforce(reward_diff) # Try to perform better than baseline
        value_loss += mse(value, Variable(torch.Tensor([reward]))) # Regress critic value toward the actual return

    # Backpropagate
    optimizer.zero_grad()
    nodes = [value_loss] + actions
    gradients = [torch.ones(1)] + [None for _ in actions] # Stochastic action nodes get their gradients from reinforce()
    autograd.backward(nodes, gradients)
    optimizer.step()

    return discounted_rewards, value_loss

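# A quick worked example of the discounting loop above (illustration only):
# with gamma = 0.9 and episode rewards [0, 0, 1], the backwards pass produces
# returns of [0.81, 0.9, 1.0] (up to floating point error), so the terminal
# reward is propagated back, shrinking by a factor of gamma per step.
_demo_R = 0
_demo_returns = []
for _demo_r in [0, 0, 1][::-1]:
    _demo_R = _demo_r + 0.9 * _demo_R
    _demo_returns.insert(0, _demo_R)
# _demo_returns is now approximately [0.81, 0.9, 1.0]
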
env = Environment()
policy = Policy(hidden_size=hidden_size)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate, weight_decay=weight_decay)

reward_avg = SlidingAverage('reward avg', steps=log_every)
value_loss_avg = SlidingAverage('value loss avg', steps=log_every)

e = 0

# Train until the sliding average of final rewards reaches 1.0 (every recent episode won)
while reward_avg < 1.0:
    actions, values, rewards = run_episode(e)
    final_reward = rewards[-1]

    discounted_rewards, value_loss = finish_episode(e, actions, values, rewards)

    job.record(e, final_reward)
    reward_avg.add(final_reward)
    value_loss_avg.add(value_loss.data[0])

    if e % log_every == 0:
        print('[episode=%d]' % e, reward_avg, value_loss_avg)

    e += 1