#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Sets up two Gridworld environments derived from OpenAI Gym's Frozen Lake
environment and uses a Q-learning (reinforcement learning) algorithm to teach
an agent to avoid a moved obstacle and to reach a goal state.
"Winter is in full swing, and the community lake has frozen over. Recalling
that the community offers warm drinks to patrons who reach the lake's gazebo,
you've donned your skates for a day out on the ice.
A large hill blocks your view of the gazebo, but you remember a way around the
hill from last winter. However, the heavy rains last fall have caused
the lake to freeze a little differently. To make matters worse,
the usual way to the gazebo is no longer there. You don't want to ruin your
skates by climbing over the hill.
Perhaps you can find another way to skate to the gazebo?"
S : your starting state, where you put on your skates
F : a frozen surface, easy to skate on, but not as rewarding as a warm drink
H : the hill, which will ruin your skates (you cannot step here)
G : the gazebo (your goal state), where you receive a reward
Assembled by Eric Easthope
'''
import numpy as np
import matplotlib.pyplot as plt
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
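# Compatibility note: this script follows the classic gym "toy_text" API, in which
# env.reset() returns an integer state and env.step() returns a 4-tuple of
# (next_state, reward, done, info). Newer gym / gymnasium releases changed both
# signatures, so running this file as-is assumes one of the older gym versions.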
# Configure an environment with a way past the obstacle on the right-hand side
left_env_map = [
'FFFFFFFFG',
'FFFFFFFFF',
'FFFFFFFFF',
'HHHHHHHHF',
'FFFFFFFFF',
'FFFSFFFFF'
]
# Configure an environment with a way past the obstacle on the left-hand side
right_env_map = [
'FFFFFFFFG',
'FFFFFFFFF',
'FFFFFFFFF',
'FHHHHHHHH',
'FFFFFFFFF',
'FFFSFFFFF'
]
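# Both maps are 6 rows by 9 columns, so the flattened environment has 54 states.
# FrozenLake numbers states row by row, i.e. state = row * 9 + col, which is how
# the state indices in the prohibited-move tables below are derived. In the "left"
# map the hill occupies the left-hand side of its row (the way past is on the
# right); in the "right" map it occupies the right-hand side.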
# Assign numerical values to possible actions (the agent's movements)
directions = {
    0: 'LEFT',
    1: 'DOWN',
    2: 'RIGHT',
    3: 'UP'
}
# Map each state bordering the obstacle to the movement that would step onto it
left_prohibited_moves = dict(
    [(s, 'DOWN') for s in range(18, 26)] +  # states directly above the hill
    [(s, 'UP') for s in range(36, 44)] +    # states directly below the hill
    [(35, 'LEFT')]                          # from the gap, LEFT steps onto the hill
)
right_prohibited_moves = dict(
    [(s, 'DOWN') for s in range(19, 27)] +  # states directly above the hill
    [(s, 'UP') for s in range(37, 45)] +    # states directly below the hill
    [(27, 'RIGHT')]                         # from the gap, RIGHT steps onto the hill
)
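# Quick sanity check (illustrative only): the frozen gap in each obstacle row
# sits at state 35 in the left map and state 27 in the right map, matching the
# single LEFT/RIGHT entries above.
assert left_env_map[3][35 % 9] == 'F'
assert right_env_map[3][27 % 9] == 'F'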
gamma = 0.95 # Set a discount factor (fixed within [0,1])
epsilon = 0.9 # Set the probability that the agent acts randomly
step_size = 0.85 # Set a step size (the learning rate, fixed within [0,1])
# Set the number of times that we train the agent to reach the goal state
total_episodes = 9000
# Set the episode at which the obstacle moves
episode_when_obstacle_moves = 1000
# Limit how many steps the agent may take during a single episode
max_steps = 50
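# Note that `epsilon` is held fixed for every episode (it is never decayed), so
# the agent keeps taking random exploratory actions 90% of the time even after
# it has learned a good path to the gazebo.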
def choose_action(environment, state, Q, epsilon):
'''
Choose an action based on:
* The `environment`
* The agent's current `state`
* An action-value function Q(state, action)
* The probability `epsilon` that the agent acts randomly
    Given some `state` s, the agent chooses a random action with probability
    `epsilon`. Otherwise, the agent chooses whichever action maximizes the
    current estimate given by `Q`.
    Returns an integer in 0-3 specifying one of the four directions.
    '''
    if np.random.uniform(0, 1) < epsilon:
        # Explore: sample a random action from the environment's action space
        action = environment.action_space.sample()
    else:
        # Exploit: take the action with the highest current Q-value estimate
        action = np.argmax(Q[state, :])
    return action
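# For example (illustrative only, assuming an environment and Q-table have been
# initialized as in the __main__ block below):
#
#     action = choose_action(left_environment, state=0, Q=Q, epsilon=0.9)
#
# With probability 0.9 this samples a random direction; otherwise it returns
# argmax_a Q[0, a], the current greedy action for state 0 (the top-left cell).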
def run_episode(environment, Q, step_size, prohibited_moves=None):
'''
Run an episode to move the agent around Gridworld
    The agent begins at the starting state S and takes at most `max_steps`
    steps to try to reach the goal state G. The agent receives a reward of +1
    for reaching the goal state.
    As the agent moves, the action-value function `Q` is iteratively updated to
    favour optimal steps.
    Returns the reward and the number of steps taken to reach the goal state G.
'''
# Reset the environment to place the agent at the starting state S
state = environment.reset()
# Take at most `max_steps` to reach the goal state G
for t in range(max_steps):
# Make the agent choose which direction to step
action = choose_action(environment, state, Q, epsilon)
        # Skip the step if the chosen action would run into the obstacle
        if (prohibited_moves is None or
                (state, directions[action]) not in prohibited_moves.items()):
# Take a step in the environment based on the chosen action
next_state, reward, done, info = environment.step(action)
# Update the action-value function `Q`
Q[state, action] = (Q[state, action] +
step_size *
(reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
)
# Update state to the state that is being moved to
state = next_state
# Give reward +1 if the goal state G is reached within `max_steps`
if done:
                return 1, t + 1
# Give reward 0 if the goal state G is not reached within `max_steps`
    return 0, max_steps
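# The update applied inside run_episode is the standard one-step Q-learning rule,
#     Q(s, a) <- Q(s, a) + step_size * (r + gamma * max_a' Q(s', a') - Q(s, a)),
# which nudges Q(s, a) toward the bootstrapped target r + gamma * max_a' Q(s', a').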
if __name__ == '__main__':
'''
Initialize and run the Q-learning algorithm within Gridworld
'''
# Initialize two "Frozen Lake" environments using the maps specified above
left_environment = FrozenLakeEnv(desc=left_env_map, is_slippery=False)
right_environment = FrozenLakeEnv(desc=right_env_map, is_slippery=False)
# Initialize an action-value function Q(state, action)
Q = np.zeros(shape=[
left_environment.observation_space.n,
left_environment.action_space.n
])
    # Initialize records of the reward and number of steps for every episode
cumulative_reward = np.zeros(total_episodes)
number_of_steps = np.zeros(total_episodes)
    print('Initialization successful. Teaching agent ...')
# Run episodes with the first (left) environment map
for episode in range(episode_when_obstacle_moves):
reward, steps = run_episode(left_environment,
Q, step_size=step_size,
prohibited_moves=left_prohibited_moves)
# Store the cumulative reward at this episode
cumulative_reward[episode] = np.max(cumulative_reward) + reward
# Store the number of steps taken during this episode
number_of_steps[episode] = steps
# Run episodes with the second (right) environment map (the obstacle moves)
for episode in range(episode_when_obstacle_moves, total_episodes):
reward, steps = run_episode(right_environment,
Q, step_size=step_size,
prohibited_moves=right_prohibited_moves)
# Store the cumulative reward at this episode
cumulative_reward[episode] = np.max(cumulative_reward) + reward
# Store the number of steps taken during this episode
number_of_steps[episode] = steps
    print('Teaching successful. Plotting result ...')
'''
    Below I plot the outcome of teaching the agent.
    The plot relates the agent's cumulative reward to the number of steps that
    the agent takes during each episode. At first, the agent takes many steps
    and earns little to no reward.
    However, once the agent has reached the goal state G a number of times, the
    cumulative reward grows roughly linearly with the number of episodes.
    The agent also needs fewer steps to reach the goal state, so the ratio of
    cumulative reward to number of steps tends to increase.
    At episode 1000, when the obstacle moves and the agent must adapt, this
    trend in the reward-step ratio is temporarily disturbed.
    Nevertheless, the agent continues to receive reward, and eventually learns
    a new optimal policy for reaching the goal state in fewer steps.
'''
plt.figure(figsize=(8,5))
plt.axis([
0, total_episodes,
0, np.max(cumulative_reward/number_of_steps) + 1
])
plt.xlabel("Episodes", fontsize=15)
plt.ylabel("Cumulative Reward / Number of Steps", fontsize=15)
    plt.plot(
        range(total_episodes),
        cumulative_reward / number_of_steps
    )
plt.show()