Commit b2d737c: update comment
Parent: ce4d343

5 files changed (+10, -7 lines)

ppo_continuous.py (+1, -1)

@@ -66,7 +66,7 @@
 BATCH = 128  # update batchsize
 A_UPDATE_STEPS = 10  # actor update steps
 C_UPDATE_STEPS = 10  # critic update steps
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 METHOD = [
     dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
     dict(name='clip', epsilon=0.2),  # Clipped surrogate objective, find this is better
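For context, the two METHOD entries select between PPO's KL-penalty and clipped-surrogate objectives, and EPS is just a small constant for numerical safety. Below is a minimal, standalone sketch of the clipped variant in PyTorch; it is not the repository's implementation, and the tensor names are assumptions.

import torch

def clipped_surrogate_loss(log_prob, old_log_prob, adv, epsilon=0.2):
    # Importance ratio pi(a|s) / pi_old(a|s), computed in log space.
    ratio = torch.exp(log_prob - old_log_prob)
    # PPO-clip: take the pessimistic minimum of the unclipped and clipped
    # surrogate terms, then negate to turn the objective into a loss.
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -torch.min(surr1, surr2).mean()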

ppo_continuous2.py (+1, -1)

@@ -48,7 +48,7 @@
 C_UPDATE_STEPS = 10  # critic update steps
 S_DIM, A_DIM = 3, 1  # state dimension, action dimension
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 TEST_EP = 10
 # ppo-penalty
 KL_TARGET = 0.01
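ACTION_RANGE hints that the policy works with normalized actions that are rescaled to the environment's bounds (S_DIM=3, A_DIM=1 matches Pendulum, whose torque range is [-2, 2]). A minimal sketch of that rescaling, assuming a tanh-squashed Gaussian policy; the function and variable names are illustrative, not the repository's API.

import torch

ACTION_RANGE = 2.  # e.g. Pendulum torque limits are [-2, 2]

def sample_action(mu, sigma, action_range=ACTION_RANGE):
    # Sample from the Gaussian policy, squash to [-1, 1] with tanh,
    # then rescale to the environment's action bounds.
    dist = torch.distributions.Normal(mu, sigma)
    raw_action = dist.sample()
    return action_range * torch.tanh(raw_action)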

ppo_continuous_multiprocess.py (+1, -1)

@@ -61,7 +61,7 @@
 BATCH = 128  # update batchsize
 A_UPDATE_STEPS = 10  # actor update steps
 C_UPDATE_STEPS = 10  # critic update steps
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 MODEL_PATH = 'model/ppo_multi'
 NUM_WORKERS=2  # or: mp.cpu_count()
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.

ppo_continuous_multiprocess2.py (+1, -1)

@@ -61,7 +61,7 @@
 BATCH = 256  # update batchsize
 A_UPDATE_STEPS = 10  # actor update steps
 C_UPDATE_STEPS = 10  # critic update steps
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 MODEL_PATH = 'model/ppo_multi'
 NUM_WORKERS=2  # or: mp.cpu_count()
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.
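NUM_WORKERS controls how many rollout processes are spawned; each pushes its episode rewards into a shared queue that the main process drains (the rewards_queue loop changed in the test file below). A minimal sketch of that pattern with torch.multiprocessing; the worker body is a placeholder, not the repository's rollout code.

import torch.multiprocessing as mp

NUM_WORKERS = 2  # or: mp.cpu_count()

def worker(worker_id, rewards_queue):
    # Placeholder rollout loop: a real worker would run episodes here and
    # report each episode reward; None signals that this worker is done.
    rewards_queue.put(float(worker_id))
    rewards_queue.put(None)

if __name__ == '__main__':
    rewards_queue = mp.Queue()
    workers = [mp.Process(target=worker, args=(i, rewards_queue)) for i in range(NUM_WORKERS)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()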

ppo_continuous_multiprocess2_test.py (+6, -3)

@@ -170,8 +170,8 @@ def a_train(self, state, action, adv, old_pi):
         """
         mu, sigma = self.actor(state)
         pi = torch.distributions.Normal(mu, sigma)
-        ratio = torch.exp(pi.log_prob(action) - old_pi.log_prob(action))
-        surr = ratio * adv
+        # ratio = torch.exp(pi.log_prob(action) - old_pi.log_prob(action))  # sometimes gives nan
+        ratio = torch.exp(pi.log_prob(action)) / (torch.exp(old_pi.log_prob(action)) + EPS)
+        surr = ratio * adv
         if self.method == 'penalty':
             kl = torch.distributions.kl_divergence(old_pi, pi)
             kl_mean = kl.mean()
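The change above computes the importance ratio by dividing the exponentiated log-probabilities, with EPS keeping the denominator nonzero if the old probability underflows; the commented-out line is the equivalent log-space form. A standalone comparison of the two formulations; the tensor values are made up for illustration.

import torch

EPS = 1e-8  # numerical residual

# Illustrative log-probabilities under the current and old policies.
log_prob = torch.tensor([-1.2, -0.5])
old_log_prob = torch.tensor([-1.0, -0.7])

# Log-space form: exp(log pi - log pi_old).
ratio_log_space = torch.exp(log_prob - old_log_prob)

# Division form used in the commit: exp(log pi) / (exp(log pi_old) + EPS).
ratio_division = torch.exp(log_prob) / (torch.exp(old_log_prob) + EPS)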
@@ -400,7 +400,10 @@ def main():
     while True:  # keep getting the episode reward from the queue
         r = rewards_queue.get()
         if r is not None:
-            rewards.append(r)
+            if len(rewards) == 0:
+                rewards.append(r)
+            else:
+                rewards.append(rewards[-1] * 0.9 + r * 0.1)
         else:
             break
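The second hunk replaces raw reward logging with an exponential moving average (decay 0.9), which smooths the plotted learning curve. A standalone sketch of the same smoothing rule; the helper name and the example rewards are illustrative.

def smooth(rewards, r, decay=0.9):
    # Exponentially weighted moving average: store the first reward as-is,
    # then blend each new reward with the previous smoothed value.
    if len(rewards) == 0:
        rewards.append(r)
    else:
        rewards.append(rewards[-1] * decay + r * (1 - decay))
    return rewards

rewards = []
for r in [-1500., -1200., -900.]:
    smooth(rewards, r)
print(rewards)  # approximately [-1500.0, -1470.0, -1413.0]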
