
Commit a62846b

add ddpg

1 parent 2a54e99 commit a62846b

5 files changed: +355 -6 lines changed


Diff for: README.md

+10 -3
@@ -12,7 +12,7 @@ This repo only contains **PyTorch** Implementation.
 ## Contents:
 
 * Two versions of **Soft Actor-Critic (SAC)** are implemented.
-
+
 **SAC Version 1**:
 
 `sac.py`: using state-value function.
@@ -25,6 +25,10 @@ This repo only contains **PyTorch** Implementation.
 
 paper: https://arxiv.org/pdf/1812.05905.pdf
 
+* **Deep Deterministic Policy Gradient (DDPG)**:
+
+`ddpg.py`: implementation of DDPG.
+
 * **Twin Delayed DDPG (TD3)**:
 
 `td3.py`: implementation of TD3.
@@ -33,6 +37,7 @@ This repo only contains **PyTorch** Implementation.
 
 * **Proximal Policy Optimization (PPO)**:
 Todo
+
 * **Actor-Critic (AC) / A2C**:
 
 `ac.py`: extensible AC/A2C, easy to change to be DDPG, etc.
@@ -44,12 +49,14 @@ This repo only contains **PyTorch** Implementation.
 * **PointNet** for landmarks generation from images with unsupervised learning is implemented [here](https://github.com/quantumiracle/PointNet_Landmarks_from_Image/tree/master). This method is also used for image-based reinforcement learning as a SOTA algorithm, called **Transporter**.
 
 original paper: [Unsupervised Learning of Object Landmarks through Conditional Image Generation](https://papers.nips.cc/paper/7657-unsupervised-learning-of-object-landmarks-through-conditional-image-generation.pdf)
-
+
 paper for RL: [Unsupervised Learning of Object Keypoints for Perception and Control](https://arxiv.org/pdf/1906.11883.pdf)
 
 
 ## Usage:
-`python ***.py`
+`python ***.py --train`
+
+`python ***.py --test`
 
 ## Troubleshooting:
 
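The `--train` / `--test` flags added to the Usage section above are wired through `argparse` at the top of the new `ddpg.py`. As a condensed sketch of that wiring (the `print` calls here are illustrative placeholders standing in for the training and testing branches of the actual script):

import argparse

# Both flags are store_true booleans defaulting to False,
# so exactly one of them is normally passed on the command line.
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--test', dest='test', action='store_true', default=False)
args = parser.parse_args()

if args.train:
    print('train branch')  # ddpg.py runs the training loop here
if args.test:
    print('test branch')   # ddpg.py loads the saved model and evaluates here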
Diff for: __pycache__/reacher.cpython-36.pyc

11 Bytes
Binary file not shown.

Diff for: ac.py

+1 -1
@@ -346,7 +346,7 @@ def plot(frame_idx, rewards):
 hidden_dim = 30
 UPDATE=['Approach0', 'Approach1'][0]
 # choose env
-ENV = ['Pendulum-v0', 'CartPole-v0', 'Reacher'][0] # Pendulum is continuous, CartPole is discrete
+ENV = ['Pendulum-v0', 'CartPole-v0', 'Reacher'][1] # Pendulum is continuous, CartPole is discrete
 if ENV == 'Reacher':
     DISCRETE = False
     hidden_dim = 512

Diff for: ddpg.py

+342
@@ -0,0 +1,342 @@
'''
DDPG (Deep Deterministic Policy Gradient)
'''

import math
import os
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions import Categorical
from collections import namedtuple

import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import display
from reacher import Reacher
import argparse


GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)

parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--test', dest='test', action='store_true', default=False)

args = parser.parse_args()


class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = int((self.position + 1) % self.capacity)  # as a ring buffer

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))  # stack for each element
        '''
        the * serves as unpack: sum(a, b) <=> batch=(a, b), sum(*batch) ;
        zip: a=[1, 2], b=[2, 3], zip(a, b) => [(1, 2), (2, 3)] ;
        map applies the function to each element: map(square, [2, 3]) => [4, 9] ;
        np.stack((1, 2)) => array([1, 2])
        '''
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)


class ActorNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, init_w=3e-3):
        super(ActorNetwork, self).__init__()
        self.action_dim = output_dim

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, output_dim)  # output dim = dim of action

        # weights initialization of the output layer
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        activation = F.relu
        x = activation(self.linear1(state))
        x = activation(self.linear2(x))
        # x = F.tanh(self.linear3(x)).clone()  # clone to prevent an in-place operation (which would break gradient computation)
        x = self.linear3(x)  # for simplicity, no restriction on action range
        return x

    def select_action(self, state, noise_scale=1.0):
        '''
        select action for sampling: no gradients flow, Gaussian exploration noise added, returned on cpu
        '''
        state = torch.FloatTensor(state).unsqueeze(0).to(device)  # state dim: (N, dim of state)
        normal = Normal(0, 1)
        action = self.forward(state)
        noise = noise_scale * normal.sample(action.shape).to(device)
        action += noise
        return action.detach().cpu().numpy()[0]

    def sample_action(self, action_range=1.):
        # purely random action, used during the initial exploration phase
        normal = Normal(0, 1)
        random_action = action_range * normal.sample((self.action_dim,))
        return random_action.cpu().numpy()

    def evaluate_action(self, state, noise_scale=0.0):
        '''
        evaluate action within the autograd graph, so gradients can flow through it; noise_scale is controllable
        '''
        normal = Normal(0, 1)
        action = self.forward(state)
        # action = torch.tanh(action)
        noise = noise_scale * normal.sample(action.shape).to(device)
        action += noise
        return action


class QNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, init_w=3e-3):
        super(QNetwork, self).__init__()

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        x = torch.cat([state, action], 1)  # dim 0 is the batch dimension
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x


class DDPG():
    def __init__(self, replay_buffer, state_dim, action_dim, hidden_dim):
        self.replay_buffer = replay_buffer
        self.qnet = QNetwork(state_dim + action_dim, hidden_dim).to(device)
        self.target_qnet = QNetwork(state_dim + action_dim, hidden_dim).to(device)
        self.policy_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_policy_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)

        print('Q network: ', self.qnet)
        print('Policy network: ', self.policy_net)

        # initialize both target networks with the weights of the online networks
        for target_param, param in zip(self.target_qnet.parameters(), self.qnet.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(param.data)

        self.q_criterion = nn.MSELoss()
        q_lr = 8e-4
        policy_lr = 8e-4
        self.update_cnt = 0

        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

    def target_soft_update(self, net, target_net, soft_tau):
        # soft (Polyak) update of the target net
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(  # copy data value into target parameters
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )
        return target_net

    def update(self, batch_size, reward_scale=10.0, gamma=0.99, soft_tau=1e-2, policy_up_itr=10, target_update_delay=3, warmup=True):
        self.update_cnt += 1
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
        # print('sample:', state, action, reward, done)

        state = torch.FloatTensor(state).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)
        # reward = reward_scale * (reward - reward.mean(dim=0)) / reward.std(dim=0)  # normalize with batch mean and std

        # train qnet: bootstrapped target uses the target policy and target Q network
        predict_q = self.qnet(state, action)
        new_next_action = self.target_policy_net.evaluate_action(next_state)
        target_q = reward + (1 - done) * gamma * self.target_qnet(next_state, new_next_action)

        q_loss = self.q_criterion(predict_q, target_q.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        # train policy_net: maximize Q(s, mu(s)); computed after the critic step so that
        # backward() does not run through qnet parameters already modified in-place by q_optimizer.step()
        new_action = self.policy_net.evaluate_action(state)
        predict_new_q = self.qnet(state, new_action)
        policy_loss = -torch.mean(predict_new_q)
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # soft-update the target networks every target_update_delay updates
        if self.update_cnt % target_update_delay == 0:
            self.target_qnet = self.target_soft_update(self.qnet, self.target_qnet, soft_tau)
            self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)

        return q_loss.detach().cpu().numpy(), policy_loss.detach().cpu().numpy()

    def save_model(self, path):
        torch.save(self.qnet.state_dict(), path + '_q')
        torch.save(self.target_qnet.state_dict(), path + '_target_q')
        torch.save(self.policy_net.state_dict(), path + '_policy')

    def load_model(self, path):
        self.qnet.load_state_dict(torch.load(path + '_q'))
        self.target_qnet.load_state_dict(torch.load(path + '_target_q'))
        self.policy_net.load_state_dict(torch.load(path + '_policy'))
        self.qnet.eval()
        self.target_qnet.eval()
        self.policy_net.eval()


def plot(rewards):
    plt.figure(figsize=(20, 5))
    plt.plot(rewards)
    plt.savefig('ddpg.png')
    # plt.show()
    plt.clf()


class NormalizedActions(gym.ActionWrapper):  # gym env wrapper, rescales actions from [-1, 1] to the env's range
    def _action(self, action):
        low = self.action_space.low
        high = self.action_space.high

        action = low + (action + 1.0) * 0.5 * (high - low)
        action = np.clip(action, low, high)
        return action

    def _reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high

        action = 2 * (action - low) / (high - low) - 1
        action = np.clip(action, low, high)
        return action


if __name__ == '__main__':
    NUM_JOINTS = 2
    LINK_LENGTH = [200, 140]
    INI_JOINT_ANGLES = [0.1, 0.1]
    SCREEN_SIZE = 1000
    # SPARSE_REWARD = False
    # SCREEN_SHOT = False
    ENV = ['Pendulum', 'Reacher'][1]
    if ENV == 'Reacher':
        env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                      ini_joint_angles=INI_JOINT_ANGLES, target_pos=[369, 430], render=True)
        action_dim = env.num_actions
        state_dim = env.num_observations
    elif ENV == 'Pendulum':
        # env = NormalizedActions(gym.make("Pendulum-v0"))
        env = gym.make("Pendulum-v0")
        action_dim = env.action_space.shape[0]
        state_dim = env.observation_space.shape[0]
    hidden_dim = 512
    explore_steps = 0  # steps of purely random exploration before using the policy
    batch_size = 64

    replay_buffer_size = int(1e6)
    replay_buffer = ReplayBuffer(replay_buffer_size)
    model_path = './model/ddpg'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)  # make sure the save directory exists
    torch.autograd.set_detect_anomaly(True)
    alg = DDPG(replay_buffer, state_dim, action_dim, hidden_dim)

    if args.train:
        # alg.load_model(model_path)

        # hyper-parameters
        max_episodes = 1000
        max_steps = 100
        frame_idx = 0
        rewards = []

        for i_episode in range(max_episodes):
            q_loss_list = []
            policy_loss_list = []
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = alg.policy_net.select_action(state)
                else:
                    action = alg.policy_net.sample_action(action_range=1.)
                next_state, reward, done, _ = env.step(action)
                if ENV != 'Reacher':
                    env.render()
                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                if len(replay_buffer) > batch_size:
                    q_loss, policy_loss = alg.update(batch_size)
                    q_loss_list.append(q_loss)
                    policy_loss_list.append(policy_loss)

                if done:
                    break

            if i_episode % 20 == 0:
                plot(rewards)
                alg.save_model(model_path)
            print('Eps: ', i_episode, '| Reward: ', episode_reward, '| Loss: ', np.average(q_loss_list), np.average(policy_loss_list))

            rewards.append(episode_reward)

    if args.test:
        test_episodes = 10
        max_steps = 100
        alg.load_model(model_path)

        for i_episode in range(test_episodes):
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = alg.policy_net.select_action(state, noise_scale=0.0)  # no exploration noise for testing
                next_state, reward, done, _ = env.step(action)

                state = next_state
                episode_reward += reward

                if done:
                    break

            print('Eps: ', i_episode, '| Reward: ', episode_reward)
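For reference, update() above follows the standard DDPG formulation (Lillicrap et al., https://arxiv.org/abs/1509.02971). In a shorthand notation chosen here (not in the code), with Q = qnet, mu = policy_net, primes for the target networks and tau = soft_tau:

% critic target and loss (target_q and q_loss in update()):
y = r + \gamma (1 - d)\, Q'\big(s', \mu'(s')\big), \qquad
L_Q = \big(Q(s, a) - y\big)^2

% actor loss (policy_loss in update()):
L_\mu = -\,\mathbb{E}\big[\, Q\big(s, \mu(s)\big) \big]

% Polyak-averaged target update, applied every target_update_delay updates
% (target_soft_update()):
\theta' \leftarrow \tau\,\theta + (1 - \tau)\,\theta'

Using a small tau (soft_tau=1e-2) together with the update delay (target_update_delay=3) keeps the target networks trailing the online networks slowly, which is what stabilizes the bootstrapped critic target.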
