Commit 9d7ab1a

plain python export
1 parent a858829 commit 9d7ab1a

1 file changed: +280 −0 lines changed
@@ -0,0 +1,280 @@
#!/usr/bin/env python

# # Practical PyTorch: Playing GridWorld with Reinforcement Learning (Actor-Critic with REINFORCE)

# ## Resources

# ## Requirements

import numpy as np
from itertools import count
import random, math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

from helpers import *

# Configuration

gamma = 0.9 # Discounted reward factor

hidden_size = 50
learning_rate = 1e-4
weight_decay = 1e-5

log_every = 1000
render_every = 20000

import sconce
job = sconce.Job('rl2', {
    'gamma': gamma,
    'learning_rate': learning_rate,
})
job.log_every = log_every
job.plot_every = 500

DROP_MAX = 0.3
DROP_MIN = 0.05
DROP_OVER = 200000

# ## The Grid World, Agent and Environment

# ### The Grid

MIN_PLANT_VALUE = -1
MAX_PLANT_VALUE = 0.5
GOAL_VALUE = 10
EDGE_VALUE = -10
VISIBLE_RADIUS = 1

class Grid():
    def __init__(self, grid_size=8, n_plants=15):
        self.grid_size = grid_size
        self.n_plants = n_plants

    def reset(self):
        padded_size = self.grid_size + 2 * VISIBLE_RADIUS
        self.grid = np.zeros((padded_size, padded_size)) # Padding for edges

        # Edges
        self.grid[0:VISIBLE_RADIUS, :] = EDGE_VALUE
        self.grid[-1*VISIBLE_RADIUS:, :] = EDGE_VALUE
        self.grid[:, 0:VISIBLE_RADIUS] = EDGE_VALUE
        self.grid[:, -1*VISIBLE_RADIUS:] = EDGE_VALUE

        # Randomly placed plants
        for i in range(self.n_plants):
            plant_value = random.random() * (MAX_PLANT_VALUE - MIN_PLANT_VALUE) + MIN_PLANT_VALUE
            ry = random.randint(0, self.grid_size-1) + VISIBLE_RADIUS
            rx = random.randint(0, self.grid_size-1) + VISIBLE_RADIUS
            self.grid[ry, rx] = plant_value

        # Goal in one of the corners
        S = VISIBLE_RADIUS
        E = self.grid_size + VISIBLE_RADIUS - 1
        gps = [(E, E), (S, E), (E, S), (S, S)]
        gp = gps[random.randint(0, len(gps)-1)]
        self.grid[gp] = GOAL_VALUE

    def visible(self, pos):
        y, x = pos
        return self.grid[y-VISIBLE_RADIUS:y+VISIBLE_RADIUS+1, x-VISIBLE_RADIUS:x+VISIBLE_RADIUS+1]
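
# A quick illustration of the Grid API (left commented out, for illustration
# only): with the default grid_size=8 and VISIBLE_RADIUS=1 the padded grid is
# 10x10, and visible(pos) returns the 3x3 window centered on pos.
# _g = Grid()
# _g.reset()
# assert _g.grid.shape == (10, 10)
# assert _g.visible((4, 4)).shape == (3, 3)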

# ### The Agent

START_HEALTH = 1
STEP_VALUE = -0.02 # Health lost on each step

class Agent:
    def reset(self):
        self.health = START_HEALTH

    def act(self, action):
        # Move according to action: 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
        y, x = self.pos
        if action == 0: y -= 1
        elif action == 1: x += 1
        elif action == 2: y += 1
        elif action == 3: x -= 1
        self.pos = (y, x)
        self.health += STEP_VALUE # Gradually getting hungrier

# ### The Environment

class Environment:
    def __init__(self):
        self.grid = Grid()
        self.agent = Agent()

    def reset(self):
        """Start a new episode by resetting grid and agent"""
        self.grid.reset()
        self.agent.reset()
        c = math.floor(self.grid.grid_size / 2)
        self.agent.pos = (c, c)

        self.t = 0
        self.history = []
        self.record_step()

        return self.visible_state

    def record_step(self):
        """Add the current state to history for display later"""
        grid = np.array(self.grid.grid)
        grid[self.agent.pos] = self.agent.health * 0.5 # Agent marker faded by health
        visible = np.array(self.grid.visible(self.agent.pos))
        self.history.append((grid, visible, self.agent.health))

    @property
    def visible_state(self):
        """Return the visible area surrounding the agent, and current agent health"""
        visible = self.grid.visible(self.agent.pos)
        y, x = self.agent.pos
        yp = (y - VISIBLE_RADIUS) / self.grid.grid_size
        xp = (x - VISIBLE_RADIUS) / self.grid.grid_size
        extras = [self.agent.health, yp, xp]
        return np.concatenate((visible.flatten(), extras), 0)

    def step(self, action):
        """Update state (grid and agent) based on an action"""
        self.agent.act(action)

        # Get reward from where agent landed, add to agent health
        value = self.grid.grid[self.agent.pos]
        self.grid.grid[self.agent.pos] = 0
        self.agent.health += value

        # Check if agent won (reached the goal) or lost (health reached 0)
        won = value == GOAL_VALUE
        lost = self.agent.health <= 0
        done = won or lost

        # Rewards at end of episode
        if won:
            reward = 1
        elif lost:
            reward = -1
        else:
            reward = 0 # Reward will only come at the end
            # reward = value # Try this for quicker learning

        # Save in history
        self.record_step()

        return self.visible_state, reward, done
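
# A minimal interaction sketch (left commented out so it does not affect the
# training run below): step the environment with uniformly random actions
# until the episode ends. Episodes always terminate, since every move costs
# STEP_VALUE health and landing on an edge (-10) or the goal ends the episode.
# _env = Environment()
# _state = _env.reset()  # 12 values: 3x3 visible window + health + y + x
# _done = False
# while not _done:
#     _state, _reward, _done = _env.step(random.randint(0, 3))
# print(len(_env.history) - 1, 'steps, final reward:', _reward)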

# ## Actor-Critic network

class Policy(nn.Module):
    def __init__(self, hidden_size):
        super(Policy, self).__init__()

        visible_squares = (VISIBLE_RADIUS * 2 + 1) ** 2
        input_size = visible_squares + 1 + 2 # Plus agent health, y, x

        self.inp = nn.Linear(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, 4 + 1, bias=False) # For both action and expected value

    def forward(self, x):
        x = x.view(1, -1)
        x = F.tanh(x) # Squash inputs
        x = F.relu(self.inp(x))
        x = self.out(x)

        # Split the five outputs into four action scores and one value estimate
        scores = x[:, :4]
        value = x[:, 4]
        return scores, value
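
# A sanity-check sketch (left commented out), using the same Variable-era
# PyTorch API as the rest of this script: the network maps the 12-dimensional
# visible state (9 visible squares + health + y + x) to four action scores
# and a single value estimate.
# _p = Policy(hidden_size=hidden_size)
# _scores, _value = _p(Variable(torch.randn(12)))
# print(_scores.size(), _value.size())  # torch.Size([1, 4]) and torch.Size([1])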

# ## Selecting actions

def select_action(e, state):
    drop = interpolate(e, DROP_MAX, DROP_MIN, DROP_OVER)

    state = Variable(torch.from_numpy(state).float())
    scores, value = policy(state) # Forward state through network
    scores = F.dropout(scores, drop, True) # Dropout for exploration
    scores = F.softmax(scores)
    action = scores.multinomial() # Sample an action

    return action, value
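
# `interpolate` is imported from helpers (not shown in this file). A rough
# sketch of the assumed behavior, annealing the exploration dropout linearly
# from DROP_MAX down to DROP_MIN over the first DROP_OVER episodes
# (hypothetical, for illustration only):
#
# def interpolate(step, start, end, over):
#     t = min(step / float(over), 1.0)
#     return start + (end - start) * t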

# ## Playing through an episode

def run_episode(e):
    state = env.reset()
    actions = []
    values = []
    rewards = []
    done = False

    while not done:
        action, value = select_action(e, state)
        state, reward, done = env.step(action.data[0, 0])
        actions.append(action)
        values.append(value)
        rewards.append(reward)

    return actions, values, rewards

# ## Using REINFORCE with a value baseline

mse = nn.MSELoss()

def finish_episode(e, actions, values, rewards):

    # Calculate discounted rewards, going backwards from end
    discounted_rewards = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Use REINFORCE on chosen actions and associated discounted rewards
    value_loss = 0
    for action, value, reward in zip(actions, values, discounted_rewards):
        reward_diff = reward - value.data[0] # Treat critic value as baseline
        action.reinforce(reward_diff) # Try to perform better than baseline
        value_loss += mse(value, Variable(torch.Tensor([reward]))) # Compare with actual reward

    # Backpropagate
    optimizer.zero_grad()
    nodes = [value_loss] + actions
    gradients = [torch.ones(1)] + [None for _ in actions] # No explicit gradients for the reinforced actions
    autograd.backward(nodes, gradients)
    optimizer.step()

    return discounted_rewards, value_loss
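
# Worked example of the discounting loop above: with gamma = 0.9 and episode
# rewards [0, 0, 1] (two intermediate steps, then the win reward), scanning
# backwards gives R = 1, then R = 0 + 0.9 * 1 = 0.9, then
# R = 0 + 0.9 * 0.9 = 0.81, so discounted_rewards = [0.81, 0.9, 1.0].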

env = Environment()
policy = Policy(hidden_size=hidden_size)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate, weight_decay=weight_decay)

reward_avg = SlidingAverage('reward avg', steps=log_every)
value_loss_avg = SlidingAverage('value loss avg', steps=log_every)

e = 0

while reward_avg < 1.0:
    actions, values, rewards = run_episode(e)
    final_reward = rewards[-1]

    discounted_rewards, value_loss = finish_episode(e, actions, values, rewards)

    job.record(e, final_reward) # REMOVE
    reward_avg.add(final_reward)
    value_loss_avg.add(value_loss.data[0])

    if e % log_every == 0:
        print('[episode=%d]' % e, reward_avg, value_loss_avg)

    e += 1
