Commit b2d737c: update comment
Parent: ce4d343

5 files changed (+10, -7 lines)

ppo_continuous.py (+1, -1)

@@ -66,7 +66,7 @@
 BATCH = 128  # update batchsize
 A_UPDATE_STEPS = 10  # actor update steps
 C_UPDATE_STEPS = 10  # critic update steps
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 METHOD = [
     dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
     dict(name='clip', epsilon=0.2),  # Clipped surrogate objective, find this is better
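For context, the two METHOD entries select between PPO's KL-penalty and clipped-surrogate objectives, and EPS is just a small constant for numerical safety. Below is a minimal, standalone sketch of the clipped variant in PyTorch; it is not the repository's implementation, and the tensor names are assumptions.

import torch

def clipped_surrogate_loss(log_prob, old_log_prob, adv, epsilon=0.2):
    # Importance ratio pi(a|s) / pi_old(a|s), computed in log space.
    ratio = torch.exp(log_prob - old_log_prob)
    # PPO-clip: take the pessimistic minimum of the unclipped and clipped
    # surrogate terms, then negate to turn the objective into a loss.
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -torch.min(surr1, surr2).mean()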

ppo_continuous2.py (+1, -1)

@@ -48,7 +48,7 @@
 C_UPDATE_STEPS = 10  # critic update steps
 S_DIM, A_DIM = 3, 1  # state dimension, action dimension
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 TEST_EP = 10
 # ppo-penalty
 KL_TARGET = 0.01
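ACTION_RANGE hints that the policy works with normalized actions that are rescaled to the environment's bounds (S_DIM=3, A_DIM=1 matches Pendulum, whose torque range is [-2, 2]). A minimal sketch of that rescaling, assuming a tanh-squashed Gaussian policy; the function and variable names are illustrative, not the repository's API.

import torch

ACTION_RANGE = 2.  # e.g. Pendulum torque limits are [-2, 2]

def sample_action(mu, sigma, action_range=ACTION_RANGE):
    # Sample from the Gaussian policy, squash to [-1, 1] with tanh,
    # then rescale to the environment's action bounds.
    dist = torch.distributions.Normal(mu, sigma)
    raw_action = dist.sample()
    return action_range * torch.tanh(raw_action)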

ppo_continuous_multiprocess.py (+1, -1)

@@ -61,7 +61,7 @@
 BATCH = 128  # update batchsize
 A_UPDATE_STEPS = 10  # actor update steps
 C_UPDATE_STEPS = 10  # critic update steps
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 MODEL_PATH = 'model/ppo_multi'
 NUM_WORKERS=2  # or: mp.cpu_count()
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.

ppo_continuous_multiprocess2.py (+1, -1)

@@ -61,7 +61,7 @@
 BATCH = 256  # update batchsize
 A_UPDATE_STEPS = 10  # actor update steps
 C_UPDATE_STEPS = 10  # critic update steps
-EPS = 1e-8  # epsilon
+EPS = 1e-8  # numerical residual
 MODEL_PATH = 'model/ppo_multi'
 NUM_WORKERS=2  # or: mp.cpu_count()
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.
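NUM_WORKERS controls how many rollout processes are spawned; each pushes its episode rewards into a shared queue that the main process drains (the rewards_queue loop changed in the test file below). A minimal sketch of that pattern with torch.multiprocessing; the worker body is a placeholder, not the repository's rollout code.

import torch.multiprocessing as mp

NUM_WORKERS = 2  # or: mp.cpu_count()

def worker(worker_id, rewards_queue):
    # Placeholder rollout loop: a real worker would run episodes here and
    # report each episode reward; None signals that this worker is done.
    rewards_queue.put(float(worker_id))
    rewards_queue.put(None)

if __name__ == '__main__':
    rewards_queue = mp.Queue()
    workers = [mp.Process(target=worker, args=(i, rewards_queue)) for i in range(NUM_WORKERS)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()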

ppo_continuous_multiprocess2_test.py (+6, -3)

@@ -170,8 +170,8 @@ def a_train(self, state, action, adv, old_pi):
         """
         mu, sigma = self.actor(state)
         pi = torch.distributions.Normal(mu, sigma)
-        ratio = torch.exp(pi.log_prob(action) - old_pi.log_prob(action))
-        surr = ratio * adv
+        # ratio = torch.exp(pi.log_prob(action) - old_pi.log_prob(action))  # sometimes gives nan
+        ratio = torch.exp(pi.log_prob(action)) / (torch.exp(old_pi.log_prob(action)) + EPS)
+        surr = ratio * adv
         if self.method == 'penalty':
             kl = torch.distributions.kl_divergence(old_pi, pi)
             kl_mean = kl.mean()
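The change above computes the importance ratio by dividing the exponentiated log-probabilities, with EPS keeping the denominator nonzero if the old probability underflows; the commented-out line is the equivalent log-space form. A standalone comparison of the two formulations; the tensor values are made up for illustration.

import torch

EPS = 1e-8  # numerical residual

# Illustrative log-probabilities under the current and old policies.
log_prob = torch.tensor([-1.2, -0.5])
old_log_prob = torch.tensor([-1.0, -0.7])

# Log-space form: exp(log pi - log pi_old).
ratio_log_space = torch.exp(log_prob - old_log_prob)

# Division form used in the commit: exp(log pi) / (exp(log pi_old) + EPS).
ratio_division = torch.exp(log_prob) / (torch.exp(old_log_prob) + EPS)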
@@ -400,7 +400,10 @@ def main():
     while True:  # keep getting the episode reward from the queue
         r = rewards_queue.get()
         if r is not None:
-            rewards.append(r)
+            if len(rewards) == 0:
+                rewards.append(r)
+            else:
+                rewards.append(rewards[-1] * 0.9 + r * 0.1)
         else:
             break
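The second hunk replaces raw reward logging with an exponential moving average (decay 0.9), which smooths the plotted learning curve. A standalone sketch of the same smoothing rule; the helper name and the example rewards are illustrative.

def smooth(rewards, r, decay=0.9):
    # Exponentially weighted moving average: store the first reward as-is,
    # then blend each new reward with the previous smoothed value.
    if len(rewards) == 0:
        rewards.append(r)
    else:
        rewards.append(rewards[-1] * decay + r * (1 - decay))
    return rewards

rewards = []
for r in [-1500., -1200., -900.]:
    smooth(rewards, r)
print(rewards)  # approximately [-1500.0, -1470.0, -1413.0]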
