
Commit e70f1ff

nkarpachevv-goncharenko authored and committed
Adding week9 materials
1 parent cafe16d commit e70f1ff

7 files changed (+735, -0 lines)

week1_09_approx_qlearning/README.md

Lines changed: 2 additions & 0 deletions
Approximate Q-learning implementation practice:
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/ml-mipt/blob/advanced/week09_approx_qlearning/week09_approximate_Q_learning.ipynb)
Lines changed: 115 additions & 0 deletions
# taken from OpenAI baselines.

import numpy as np
import gym


class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == self._skip - 2: self._obs_buffer[0] = obs
            if i == self._skip - 1: self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # Note that the observation on the done=True frame
        # doesn't matter
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)


class ClipRewardEnv(gym.RewardWrapper):
    def __init__(self, env):
        gym.RewardWrapper.__init__(self, env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        return np.sign(reward)


class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done = True

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # check current lives, make loss of life terminal,
        # then update lives to handle bonus lives
        lives = self.env.unwrapped.ale.lives()
        if lives < self.lives and lives > 0:
            # for Qbert sometimes we stay in lives == 0 condition for a few frames
            # so it's important to keep lives > 0, so that we only reset once
            # the environment advertises done.
            done = True
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if self.was_real_done:
            obs = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _ = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs


# in torch imgs have shape [c, h, w] instead of common [h, w, c]
class AntiTorchWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)

        self.img_size = [env.observation_space.shape[i]
                         for i in [1, 2, 0]
                         ]
        self.observation_space = gym.spaces.Box(0.0, 1.0, self.img_size)

    def _observation(self, img):
        """what happens to each observation"""
        img = img.transpose(1, 2, 0)
        return img
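
For orientation, here is a minimal sketch of how these wrappers might be chained around an Atari environment. The make_env helper name and the BreakoutNoFrameskip-v4 id are assumptions for illustration; the snippet relies on the classes defined above and the legacy gym API where step returns (obs, reward, done, info):

import gym

def make_env(env_id='BreakoutNoFrameskip-v4'):
    # hypothetical helper: compose the wrappers defined in this file
    env = gym.make(env_id)
    env = MaxAndSkipEnv(env, skip=4)   # repeat each action and max-pool the last frames
    env = EpisodicLifeEnv(env)         # treat loss of life as end of episode
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)        # press FIRE on reset when the game requires it
    env = ClipRewardEnv(env)           # clip rewards to {-1, 0, +1}
    return env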
Lines changed: 41 additions & 0 deletions
import numpy as np
from gym.spaces.box import Box
from gym.core import Wrapper


class FrameBuffer(Wrapper):
    def __init__(self, env, n_frames=4, dim_order='tensorflow'):
        """A gym wrapper that stacks the last n_frames observations along the channel axis"""
        super(FrameBuffer, self).__init__(env)
        self.dim_order = dim_order
        if dim_order == 'tensorflow':
            height, width, n_channels = env.observation_space.shape
            obs_shape = [height, width, n_channels * n_frames]
        elif dim_order == 'pytorch':
            n_channels, height, width = env.observation_space.shape
            obs_shape = [n_channels * n_frames, height, width]
        else:
            raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order))
        self.observation_space = Box(0.0, 1.0, obs_shape)
        self.framebuffer = np.zeros(obs_shape, 'float32')

    def reset(self):
        """resets breakout, returns initial frames"""
        self.framebuffer = np.zeros_like(self.framebuffer)
        self.update_buffer(self.env.reset())
        return self.framebuffer

    def step(self, action):
        """plays breakout for 1 step, returns frame buffer"""
        new_img, reward, done, info = self.env.step(action)
        self.update_buffer(new_img)
        return self.framebuffer, reward, done, info

    def update_buffer(self, img):
        if self.dim_order == 'tensorflow':
            offset = self.env.observation_space.shape[-1]
            axis = -1
            cropped_framebuffer = self.framebuffer[:, :, :-offset]
        elif self.dim_order == 'pytorch':
            offset = self.env.observation_space.shape[0]
            axis = 0
            cropped_framebuffer = self.framebuffer[:-offset]
        self.framebuffer = np.concatenate([img, cropped_framebuffer], axis=axis)
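
A short usage sketch, assuming some_preprocessed_env is an already-preprocessed environment (hypothetical name) whose observations match the chosen dim_order; 'tensorflow' means channels-last here:

# stack the last 4 channels-last frames into one observation
env = FrameBuffer(some_preprocessed_env, n_frames=4, dim_order='tensorflow')
obs = env.reset()   # shape: [height, width, n_channels * 4]
next_obs, reward, done, info = env.step(env.action_space.sample())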
Lines changed: 63 additions & 0 deletions
# This code is shamelessly stolen from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
import numpy as np
import random


class ReplayBuffer(object):
    def __init__(self, size):
        """Create Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            data = self._storage[i]
            obs_t, action, reward, obs_tp1, done = data
            obses_t.append(np.array(obs_t, copy=False))
            actions.append(np.array(action, copy=False))
            rewards.append(reward)
            obses_tp1.append(np.array(obs_tp1, copy=False))
            dones.append(done)
        return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)

    def sample(self, batch_size):
        """Sample a batch of experiences.
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)
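
A minimal sketch of the intended usage, assuming env follows the legacy gym API returning (obs, reward, done, info) from step:

exp_replay = ReplayBuffer(size=10 ** 4)

s = env.reset()
for _ in range(100):
    a = env.action_space.sample()
    next_s, r, done, _ = env.step(a)
    exp_replay.add(s, a, r, next_s, done)   # store one transition
    s = env.reset() if done else next_s

# sample a training batch of 32 transitions
obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = exp_replay.sample(32)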

week1_09_approx_qlearning/utils.py

Lines changed: 87 additions & 0 deletions
import numpy as np
import psutil
from scipy.signal import convolve, gaussian
import torch
from torch import nn
import os


def get_cum_discounted_rewards(rewards, gamma):
    """
    evaluates cumulative discounted rewards:
    r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    """
    cum_rewards = []
    cum_rewards.append(rewards[-1])
    for r in reversed(rewards[:-1]):
        cum_rewards.insert(0, r + gamma * cum_rewards[0])
    return cum_rewards


def play_and_log_episode(env, agent, gamma=0.99, t_max=10000):
    """
    always greedy
    """
    states = []
    v_mc = []
    v_agent = []
    q_spreads = []
    td_errors = []
    rewards = []

    s = env.reset()
    for step in range(t_max):
        states.append(s)
        qvalues = agent.get_qvalues([s])
        max_q_value, min_q_value = np.max(qvalues), np.min(qvalues)
        v_agent.append(max_q_value)
        q_spreads.append(max_q_value - min_q_value)
        if step > 0:
            td_errors.append(np.abs(rewards[-1] + gamma * v_agent[-1] - v_agent[-2]))

        action = qvalues.argmax(axis=-1)[0]

        s, r, done, _ = env.step(action)
        rewards.append(r)
        if done:
            break
    td_errors.append(np.abs(rewards[-1] + gamma * v_agent[-1] - v_agent[-2]))

    v_mc = get_cum_discounted_rewards(rewards, gamma)

    return_pack = {
        'states': np.array(states),
        'v_mc': np.array(v_mc),
        'v_agent': np.array(v_agent),
        'q_spreads': np.array(q_spreads),
        'td_errors': np.array(td_errors),
        'rewards': np.array(rewards),
        'episode_finished': np.array(done)
    }

    return return_pack


def img_by_obs(obs, state_dim):
    """
    Unwraps obs by channels.
    observation is of shape [c, h=w, w=h]
    """
    return obs.reshape([-1, state_dim[2]])


def is_enough_ram(min_available_gb=0.1):
    mem = psutil.virtual_memory()
    return mem.available >= min_available_gb * (1024 ** 3)


def linear_decay(init_val, final_val, cur_step, total_steps):
    if cur_step >= total_steps:
        return final_val
    return (init_val * (total_steps - cur_step) + final_val * cur_step) / total_steps


def smoothen(values):
    kernel = gaussian(100, std=100)
    # kernel = np.concatenate([np.arange(100), np.arange(99, -1, -1)])
    kernel = kernel / np.sum(kernel)
    return convolve(values, kernel, 'valid')
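
As an illustration of linear_decay, a typical epsilon-greedy exploration schedule (the concrete numbers below are an assumption for the example, not values taken from the notebook):

# epsilon decays linearly from 1.0 to 0.1 over the first 10**4 steps, then stays at 0.1
for step in (0, 2500, 5000, 10000, 20000):
    eps = linear_decay(init_val=1.0, final_val=0.1, cur_step=step, total_steps=10 ** 4)
    print(step, round(eps, 3))   # prints 1.0, 0.775, 0.55, 0.1, 0.1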
Binary file not shown.
