
Commit 30e757e

commit message: update
1 parent 05feb39

18 files changed: +2134, -0 lines changed

rl3/a2c/a2c.py (+214 lines)
@@ -0,0 +1,214 @@
# https://deeplearningcourses.com/c/cutting-edge-artificial-intelligence
import time
import joblib
import numpy as np
import tensorflow as tf
import os


def set_global_seeds(i):
    tf.set_random_seed(i)
    np.random.seed(i)


def cat_entropy(logits):
    a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)

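
# Reference sketch (illustrative, not used elsewhere in this file): cat_entropy
# above is the entropy of softmax(logits), computed stably by subtracting the
# row-wise max. A NumPy equivalent for hand-checking, assuming 2-D logits:
def cat_entropy_np(logits):
    a0 = logits - logits.max(axis=1, keepdims=True)
    ea0 = np.exp(a0)
    z0 = ea0.sum(axis=1, keepdims=True)
    p0 = ea0 / z0
    return (p0 * (np.log(z0) - a0)).sum(axis=1)  # equals -sum(p0 * log(p0), axis=1)
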

def find_trainable_variables(key):
    with tf.variable_scope(key):
        return tf.trainable_variables()


def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]

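
# Worked example (illustrative): with gamma=0.9,
#   discount_with_dones([1, 1, 1], [0, 0, 1], 0.9)
# walks the rewards in reverse, zeroing the running return wherever done=1:
#   r2 = 1.0
#   r1 = 1 + 0.9 * 1.0 = 1.9
#   r0 = 1 + 0.9 * 1.9 = 2.71
# and returns [2.71, 1.9, 1.0]. Runner.run below uses the same helper, appending
# the bootstrap value estimate (with a trailing done=0) when a rollout does not
# end in a terminal state, then dropping that extra entry.
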

class Agent:
    def __init__(self, Network, ob_space, ac_space, nenvs, nsteps, nstack,
                 ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
                 alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6)):
        config = tf.ConfigProto(intra_op_parallelism_threads=nenvs,
                                inter_op_parallelism_threads=nenvs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = Network(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = Network(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
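        # Descriptive note: step_model (nsteps=1) is used for acting, one step per
        # environment, while train_model consumes the full nenvs * nsteps batch for
        # updates; reuse=True means the two are intended to share the same "model"
        # variables collected below.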

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(tf.squared_difference(tf.squeeze(train_model.vf), R) / 2.0)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_params = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads_and_params)

        def train(states, rewards, actions, values):
            advs = rewards - values
            feed_dict = {train_model.X: states, A: actions, ADV: advs, R: rewards, LR: lr}
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                feed_dict
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)


class Runner:
    def __init__(self, env, agent, nsteps=5, nstack=4, gamma=0.99):
        self.env = env
        self.agent = agent
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs
        self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack)
        self.state = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
        self.nc = nc
        obs = env.reset()
        self.update_state(obs)
        self.gamma = gamma
        self.nsteps = nsteps
        self.dones = [False for _ in range(nenv)]
        self.total_rewards = []  # store all workers' total rewards
        self.real_total_rewards = []

    def update_state(self, obs):
        # Do frame-stacking here instead of the FrameStack wrapper to reduce IPC overhead
        self.state = np.roll(self.state, shift=-self.nc, axis=3)
        self.state[:, :, :, -self.nc:] = obs
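        # Illustrative example: with nstack=4 and nc=1, if the stacked channels hold
        # frames [f0, f1, f2, f3], np.roll shifts them to [f1, f2, f3, f0] and the
        # assignment overwrites the last slot with the new frame, giving [f1, f2, f3, f4].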

    def run(self):
        mb_states, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        for n in range(self.nsteps):
            actions, values = self.agent.step(self.state)
            mb_states.append(np.copy(self.state))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, infos = self.env.step(actions)
            for done, info in zip(dones, infos):
                if done:
                    self.total_rewards.append(info['reward'])
                    if info['total_reward'] != -1:
                        self.real_total_rewards.append(info['total_reward'])
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.state[n] = self.state[n] * 0
            self.update_state(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        # batch of steps to batch of rollouts
        mb_states = np.asarray(mb_states, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_dones = mb_dones[:, 1:]
        last_values = self.agent.value(self.state).tolist()
        # discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        return mb_states, mb_rewards, mb_actions, mb_values


def learn(network, env, seed, new_session=True, nsteps=5, nstack=4, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4,
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=1000):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    env_id = env.env_id
    save_name = os.path.join('models', env_id + '.save')
    ob_space = env.observation_space
    ac_space = env.action_space
    agent = Agent(Network=network, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, nstack=nstack,
                  ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps)
    if os.path.exists(save_name):
        agent.load(save_name)

    runner = Runner(env, agent, nsteps=nsteps, nstack=nstack, gamma=gamma)

    nbatch = nenvs * nsteps
    tstart = time.time()
    for update in range(1, total_timesteps // nbatch + 1):
        states, rewards, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = agent.train(
            states, rewards, actions, values)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            print(' - - - - - - - ')
            print("nupdates", update)
            print("total_timesteps", update * nbatch)
            print("fps", fps)
            print("policy_entropy", float(policy_entropy))
            print("value_loss", float(value_loss))

            # total reward
            r = runner.total_rewards[-100:]  # get last 100
            tr = runner.real_total_rewards[-100:]
            if len(r) == 100:
                print("avg reward (last 100):", np.mean(r))
            if len(tr) == 100:
                print("avg total reward (last 100):", np.mean(tr))
                print("max (last 100):", np.max(tr))

            agent.save(save_name)

    env.close()
    agent.save(save_name)
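
For reference, learn() is the module's training entry point. A minimal driver might look like the sketch below; the policy network and the vectorized environment are built elsewhere in this commit, so CnnPolicy and make_vec_env are hypothetical placeholder names (the env is assumed to expose num_envs, env_id, reset(), step(), and close() as used above).

    # Hypothetical driver sketch; CnnPolicy and make_vec_env are placeholders.
    from a2c import learn  # this file: rl3/a2c/a2c.py

    env = make_vec_env('BreakoutNoFrameskip-v4', num_envs=16)  # placeholder helper
    learn(network=CnnPolicy,  # must match the Network(...) calls in Agent.__init__
          env=env,
          seed=0,
          nsteps=5,
          nstack=4,
          total_timesteps=int(2e6),
          log_interval=100)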
