'''
DDPG (Deep Deterministic Policy Gradient)

Off-policy actor-critic for continuous control (Lillicrap et al., 2015,
"Continuous control with deep reinforcement learning"): a deterministic
policy is trained to maximize the critic's Q-estimate, while the critic
is regressed onto targets computed with slowly-updated target networks
from transitions sampled out of a replay buffer.
'''


import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

import matplotlib.pyplot as plt
from reacher import Reacher
import argparse

GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)

parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--test', dest='test', action='store_true', default=False)

args = parser.parse_args()

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = int(capacity)  # cast so a float like 1e6 still works
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity  # overwrite oldest entries: a ring buffer

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))  # stack each field across the batch
        '''
        * unpacks the batch list: zip(*batch) <=> zip(batch[0], batch[1], ...) ;
        zip regroups transitions by field: zip((s1,a1), (s2,a2)) => [(s1,s2), (a1,a2)] ;
        map applies the function to each element: map(np.square, [2,3]) => [4,9] ;
        np.stack((1,2)) => array([1, 2])
        '''
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

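# A quick usage sketch of the buffer (illustrative shapes only, not part of
# the training script): each sampled field comes back as one stacked numpy
# array with the batch along dim 0.
#
#   buffer = ReplayBuffer(capacity=1000)
#   for _ in range(3):
#       buffer.push(np.zeros(4), np.zeros(2), 0.0, np.zeros(4), False)
#   s, a, r, s2, d = buffer.sample(batch_size=2)
#   # s.shape == (2, 4), a.shape == (2, 2), r.shape == (2,), d.shape == (2,)
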

class ActorNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, init_w=3e-3):
        super(ActorNetwork, self).__init__()
        self.action_dim = output_dim

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, output_dim)  # output dim = dim of action

        # initialize the output layer with small weights so initial actions stay near zero
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        activation = F.relu
        x = activation(self.linear1(state))
        x = activation(self.linear2(x))
        # x = torch.tanh(self.linear3(x))  # bound actions to [-1, 1] if the env requires it
        x = self.linear3(x)  # for simplicity, no restriction on action range

        return x

    def select_action(self, state, noise_scale=1.0):
        '''
        select action for sampling: no gradients flow, Gaussian exploration
        noise is added, and the result is returned as a numpy array on the CPU
        '''
        state = torch.FloatTensor(state).unsqueeze(0).to(device)  # state dim: (N, dim of state)
        normal = Normal(0, 1)
        with torch.no_grad():
            action = self.forward(state)
            noise = noise_scale * normal.sample(action.shape).to(device)
            action = action + noise
        return action.cpu().numpy()[0]

    def sample_action(self, action_range=1.):
        '''
        random Gaussian action scaled by action_range, for initial exploration
        '''
        normal = Normal(0, 1)
        random_action = action_range * normal.sample((self.action_dim,))

        return random_action.cpu().numpy()

    def evaluate_action(self, state, noise_scale=0.0):
        '''
        evaluate action inside the autograd graph so gradients can flow
        through it; noise_scale is controllable (0 by default)
        '''
        normal = Normal(0, 1)
        action = self.forward(state)
        # action = torch.tanh(action)
        noise = noise_scale * normal.sample(action.shape).to(device)
        action = action + noise  # out-of-place add, so autograd can derive gradients
        return action


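# The original DDPG paper explored with temporally correlated
# Ornstein-Uhlenbeck noise rather than the white Gaussian noise used above;
# later work (e.g. TD3) reported uncorrelated Gaussian noise performs
# comparably. A minimal OU-process sketch follows in case correlated noise is
# wanted (unused by default; the theta/sigma values are common defaults, not
# tuned for these tasks):
class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        # restart the process at its mean, typically at episode boundaries
        self.state = np.copy(self.mu)

    def sample(self):
        # Euler step of dx = theta * (mu - x) dt + sigma * dW, with dt = 1
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

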
class QNetwork(nn.Module):
    '''
    critic: estimates Q(s, a) from the concatenated state-action vector
    '''
    def __init__(self, input_dim, hidden_dim, init_w=3e-3):
        super(QNetwork, self).__init__()

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        x = torch.cat([state, action], 1)  # dim 0 is the batch dimension
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x

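# Shape sketch (illustrative): for a batch of N transitions, state
# (N, state_dim) and action (N, action_dim) are concatenated to
# (N, state_dim + action_dim), and the critic returns Q-values of shape (N, 1).
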
class DDPG():
    def __init__(self, replay_buffer, state_dim, action_dim, hidden_dim):
        self.replay_buffer = replay_buffer
        self.qnet = QNetwork(state_dim + action_dim, hidden_dim).to(device)
        self.target_qnet = QNetwork(state_dim + action_dim, hidden_dim).to(device)
        self.policy_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_policy_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)

        print('Q network: ', self.qnet)
        print('Policy network: ', self.policy_net)

        # start both target networks as exact copies of their online networks
        for target_param, param in zip(self.target_qnet.parameters(), self.qnet.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(param.data)
        self.q_criterion = nn.MSELoss()
        q_lr = 8e-4
        policy_lr = 8e-4
        self.update_cnt = 0

        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

    def target_soft_update(self, net, target_net, soft_tau):
        # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(  # copy the blended value into the target parameters
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

        return target_net

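    # The DDPG update, in the notation of the paper (a sketch of what the
    # method below computes):
    #   critic target:  y = r + gamma * (1 - done) * Q'(s', mu'(s'))
    #   critic loss:    L_Q  = MSE( Q(s, a), y )       with y detached
    #   actor loss:     L_pi = - mean( Q(s, mu(s)) )   i.e. ascend the critic
    # where primed networks are the slowly-updated target networks.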
    def update(self, batch_size, reward_scale=10.0, gamma=0.99, soft_tau=1e-2, target_update_delay=3):
        self.update_cnt += 1
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state = torch.FloatTensor(state).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

        predict_q = self.qnet(state, action)  # Q(s, a), for the critic loss
        new_next_action = self.target_policy_net.evaluate_action(next_state)  # mu'(s'), for the critic target
        new_action = self.policy_net.evaluate_action(state)  # mu(s), for the actor loss
        predict_new_q = self.qnet(state, new_action)  # Q(s, mu(s)), for the actor loss
        target_q = reward + (1 - done) * gamma * self.target_qnet(next_state, new_next_action)  # critic target
        # reward = reward_scale * (reward - reward.mean(dim=0)) / reward.std(dim=0)  # optionally normalize with batch mean and std

        # train the critic
        q_loss = self.q_criterion(predict_q, target_q.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        # train the actor
        policy_loss = -torch.mean(predict_new_q)
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # soft-update the target networks every target_update_delay steps
        if self.update_cnt % target_update_delay == 0:
            self.target_qnet = self.target_soft_update(self.qnet, self.target_qnet, soft_tau)
            self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)

        return q_loss.detach().cpu().numpy(), policy_loss.detach().cpu().numpy()

    def save_model(self, path):
        torch.save(self.qnet.state_dict(), path + '_q')
        torch.save(self.target_qnet.state_dict(), path + '_target_q')
        torch.save(self.policy_net.state_dict(), path + '_policy')

    def load_model(self, path):
        # map_location lets checkpoints saved on GPU load on a CPU-only machine
        self.qnet.load_state_dict(torch.load(path + '_q', map_location=device))
        self.target_qnet.load_state_dict(torch.load(path + '_target_q', map_location=device))
        self.policy_net.load_state_dict(torch.load(path + '_policy', map_location=device))
        self.qnet.eval()
        self.target_qnet.eval()
        self.policy_net.eval()

def plot(rewards):
    plt.figure(figsize=(20, 5))
    plt.plot(rewards)
    plt.savefig('ddpg.png')
    # plt.show()
    plt.clf()

class NormalizedActions(gym.ActionWrapper):  # gym env wrapper
    # note: older gym versions dispatch to _action/_reverse_action; newer ones
    # expect the methods to be named action/reverse_action instead
    def _action(self, action):
        low = self.action_space.low
        high = self.action_space.high

        # rescale from [-1, 1] to [low, high]
        action = low + (action + 1.0) * 0.5 * (high - low)
        action = np.clip(action, low, high)

        return action

    def _reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high

        # rescale from [low, high] back to [-1, 1], clipping in the new range
        action = 2 * (action - low) / (high - low) - 1
        action = np.clip(action, -1.0, 1.0)

        return action


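# Usage sketch (illustrative): Pendulum-v0 has a torque range of [-2.0, 2.0],
# so wrapping it maps a policy output of 0.5 in [-1, 1] to a torque of 1.0:
#   env = NormalizedActions(gym.make("Pendulum-v0"))

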
if __name__ == '__main__':
    NUM_JOINTS = 2
    LINK_LENGTH = [200, 140]
    INI_JOINT_ANGLES = [0.1, 0.1]
    SCREEN_SIZE = 1000
    # SPARSE_REWARD = False
    # SCREEN_SHOT = False
    ENV = ['Pendulum', 'Reacher'][1]
    if ENV == 'Reacher':
        env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                      ini_joint_angles=INI_JOINT_ANGLES, target_pos=[369, 430], render=True)
        action_dim = env.num_actions
        state_dim = env.num_observations
    elif ENV == 'Pendulum':
        # env = NormalizedActions(gym.make("Pendulum-v0"))
        env = gym.make("Pendulum-v0")
        action_dim = env.action_space.shape[0]
        state_dim = env.observation_space.shape[0]
    hidden_dim = 512
    explore_steps = 0  # steps of random exploration before using the policy
    batch_size = 64

    replay_buffer_size = int(1e6)
    replay_buffer = ReplayBuffer(replay_buffer_size)
    model_path = './model/ddpg'
    torch.autograd.set_detect_anomaly(True)  # debugging aid; slows training, disable once stable
    alg = DDPG(replay_buffer, state_dim, action_dim, hidden_dim)

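    # Training loop sketch: act (randomly for the first explore_steps frames,
    # then with the noisy policy), store the transition, and once the buffer
    # holds more than one batch perform a single gradient update per env step.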
    if args.train:
        # alg.load_model(model_path)

        # hyper-parameters
        max_episodes = 1000
        max_steps = 100
        frame_idx = 0
        rewards = []

        for i_episode in range(max_episodes):
            q_loss_list = []
            policy_loss_list = []
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = alg.policy_net.select_action(state)
                else:
                    action = alg.policy_net.sample_action(action_range=1.)
                next_state, reward, done, _ = env.step(action)
                if ENV != 'Reacher':
                    env.render()
                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                if len(replay_buffer) > batch_size:
                    q_loss, policy_loss = alg.update(batch_size)
                    q_loss_list.append(q_loss)
                    policy_loss_list.append(policy_loss)

                if done:
                    break
            if i_episode % 20 == 0:
                plot(rewards)
                alg.save_model(model_path)
            print('Eps: ', i_episode, '| Reward: ', episode_reward, '| Loss: ', np.average(q_loss_list), np.average(policy_loss_list))

            rewards.append(episode_reward)


    if args.test:
        test_episodes = 10
        max_steps = 100
        alg.load_model(model_path)

        for i_episode in range(test_episodes):
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = alg.policy_net.select_action(state, noise_scale=0.0)  # no exploration noise at test time
                next_state, reward, done, _ = env.step(action)

                state = next_state
                episode_reward += reward

                if done:
                    break

            print('Eps: ', i_episode, '| Reward: ', episode_reward)