Commit 57e3875
Author: User
Commit message: update
Parent: ac0d8a5

File tree: 2 files changed, +170 −13 lines


rl/cartpole.py (+17 −13)

@@ -29,14 +29,15 @@ def epsilon_greedy(model, s, eps=0.1):
 def gather_samples(env, n_episodes=10000):
   samples = []
   for _ in range(n_episodes):
-    s = env.reset()
+    s, info = env.reset()
     done = False
-    while not done:
+    truncated = False
+    while not (done or truncated):
       a = env.action_space.sample()
       sa = np.concatenate((s, [a]))
       samples.append(sa)
 
-      s, r, done, info = env.step(a)
+      s, r, done, truncated, info = env.step(a)
   return samples
 
 

@@ -70,31 +71,33 @@ def test_agent(model, env, n_episodes=20):
   reward_per_episode = np.zeros(n_episodes)
   for it in range(n_episodes):
     done = False
+    truncated = False
     episode_reward = 0
-    s = env.reset()
-    while not done:
+    s, info = env.reset()
+    while not (done or truncated):
       a = epsilon_greedy(model, s, eps=0)
-      s, r, done, info = env.step(a)
+      s, r, done, truncated, info = env.step(a)
       episode_reward += r
     reward_per_episode[it] = episode_reward
   return np.mean(reward_per_episode)
 
 
 def watch_agent(model, env, eps):
   done = False
+  truncated = False
   episode_reward = 0
-  s = env.reset()
-  while not done:
+  s, info = env.reset()
+  while not (done or truncated):
     a = epsilon_greedy(model, s, eps=eps)
-    s, r, done, info = env.step(a)
+    s, r, done, truncated, info = env.step(a)
     env.render()
     episode_reward += r
   print("Episode reward:", episode_reward)
 
 
 if __name__ == '__main__':
   # instantiate environment
-  env = gym.make("CartPole-v0")
+  env = gym.make("CartPole-v1", render_mode="rgb_array")
 
   model = Model(env)
   reward_per_episode = []
@@ -105,12 +108,13 @@ def watch_agent(model, env, eps):
   # repeat until convergence
   n_episodes = 1500
   for it in range(n_episodes):
-    s = env.reset()
+    s, info = env.reset()
     episode_reward = 0
     done = False
-    while not done:
+    truncated = False
+    while not (done or truncated):
       a = epsilon_greedy(model, s)
-      s2, r, done, info = env.step(a)
+      s2, r, done, truncated, info = env.step(a)
 
       # get the target
       if done:
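
The edits above track the Gym API change introduced in version 0.26: env.reset() now returns an (observation, info) pair, env.step() returns a five-tuple that reports termination and truncation separately, and render_mode is chosen at gym.make() time. A minimal sketch of the new-style interaction loop, assuming a gym release >= 0.26 (the version bound is an assumption, not stated in the commit):

import gym

env = gym.make("CartPole-v1", render_mode="rgb_array")
s, info = env.reset()          # reset() returns (observation, info)
done = False
truncated = False
while not (done or truncated):
  a = env.action_space.sample()  # random action, as in gather_samples()
  # step() returns (obs, reward, terminated, truncated, info)
  s, r, done, truncated, info = env.step(a)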

rl/cartpole_gym0.19.py (new file, +153 lines)

# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.kernel_approximation import RBFSampler


GAMMA = 0.99
ALPHA = 0.1


def epsilon_greedy(model, s, eps=0.1):
  # we'll use epsilon-soft to ensure all states are visited
  # what happens if you don't do this? i.e. eps=0
  p = np.random.random()
  if p < (1 - eps):
    values = model.predict_all_actions(s)
    return np.argmax(values)
  else:
    return model.env.action_space.sample()


def gather_samples(env, n_episodes=10000):
  samples = []
  for _ in range(n_episodes):
    s = env.reset()
    done = False
    while not done:
      a = env.action_space.sample()
      sa = np.concatenate((s, [a]))
      samples.append(sa)

      s, r, done, info = env.step(a)
  return samples


class Model:
  def __init__(self, env):
    # fit the featurizer to data
    self.env = env
    samples = gather_samples(env)
    self.featurizer = RBFSampler()
    self.featurizer.fit(samples)
    dims = self.featurizer.n_components

    # initialize linear model weights
    self.w = np.zeros(dims)

  def predict(self, s, a):
    sa = np.concatenate((s, [a]))
    x = self.featurizer.transform([sa])[0]
    return x @ self.w

  def predict_all_actions(self, s):
    return [self.predict(s, a) for a in range(self.env.action_space.n)]

  def grad(self, s, a):
    sa = np.concatenate((s, [a]))
    x = self.featurizer.transform([sa])[0]
    return x


def test_agent(model, env, n_episodes=20):
  reward_per_episode = np.zeros(n_episodes)
  for it in range(n_episodes):
    done = False
    episode_reward = 0
    s = env.reset()
    while not done:
      a = epsilon_greedy(model, s, eps=0)
      s, r, done, info = env.step(a)
      episode_reward += r
    reward_per_episode[it] = episode_reward
  return np.mean(reward_per_episode)


def watch_agent(model, env, eps):
  done = False
  episode_reward = 0
  s = env.reset()
  while not done:
    a = epsilon_greedy(model, s, eps=eps)
    s, r, done, info = env.step(a)
    env.render()
    episode_reward += r
  print("Episode reward:", episode_reward)


if __name__ == '__main__':
  # instantiate environment
  env = gym.make("CartPole-v0")

  model = Model(env)
  reward_per_episode = []

  # watch untrained agent
  watch_agent(model, env, eps=0)

  # repeat until convergence
  n_episodes = 1500
  for it in range(n_episodes):
    s = env.reset()
    episode_reward = 0
    done = False
    while not done:
      a = epsilon_greedy(model, s)
      s2, r, done, info = env.step(a)

      # get the target
      if done:
        target = r
      else:
        values = model.predict_all_actions(s2)
        target = r + GAMMA * np.max(values)

      # update the model
      g = model.grad(s, a)
      err = target - model.predict(s, a)
      model.w += ALPHA * err * g

      # accumulate reward
      episode_reward += r

      # update state
      s = s2

    if (it + 1) % 50 == 0:
      print(f"Episode: {it + 1}, Reward: {episode_reward}")

    # early exit
    if it > 20 and np.mean(reward_per_episode[-20:]) == 200:
      print("Early exit")
      break

    reward_per_episode.append(episode_reward)

  # test trained agent
  test_reward = test_agent(model, env)
  print(f"Average test reward: {test_reward}")

  plt.plot(reward_per_episode)
  plt.title("Reward per episode")
  plt.show()

  # watch trained agent
  watch_agent(model, env, eps=0)
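
Since the repository now carries both API styles, one hypothetical way to drive a single loop against either Gym generation is a small compatibility shim; the helpers below are illustrative sketches, not part of either file in this commit:

def reset_env(env):
  # gym >= 0.26 returns (obs, info); older releases such as 0.19 return obs only
  out = env.reset()
  return out[0] if isinstance(out, tuple) else out


def step_env(env, a):
  # normalize the old 4-tuple and new 5-tuple returns of env.step()
  out = env.step(a)
  if len(out) == 5:
    s, r, terminated, truncated, info = out
    return s, r, terminated or truncated, info
  return out  # (s, r, done, info)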
