
Commit b354cc3

Merge pull request #8 from HumanCompatibleAI/rework-regression-model
Rework training supervised model
2 parents d756cc5 + b8c72e0 commit b354cc3

10 files changed: +373 / -116 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -15,6 +15,7 @@ __pycache__/
 /fig
 src/reward_preprocessing/output
 src/scratch
+src/output
 
 .idea/
 
```

README.md

Lines changed: 60 additions & 2 deletions
````diff
@@ -22,7 +22,65 @@ You can also use docker by building the image and running the scripts from insid
 
 ## Usage
 
-Use `print_config` to get details of the config params of the sacred scripts.
+### Getting a Reward Model
+
+#### Training the Supervised Reward Model
+
+- Get an expert rl agent
+- For coinrun, use this fork: [PavelCz/train-procgen-pytorch](https://github.com/PavelCz/train-procgen-pytorch)
+- Something like this to train:
+```bash
+python train.py \
+    --exp_name coinrun \
+    --env_name coinrun \
+    --num_levels 100000 \
+    --distribution_mode hard \
+    --param_name hard-500 \
+    --num_timesteps 200000000 \
+    --num_checkpoints 5 \
+    --seed 6033 \
+    --random_percent 0 \
+    --gpu_device=3
+```
+- Render videos to see how well the expert does:
+```bash
+python render.py \
+    --exp_name render \
+    --env_name coinrun \
+    --num_levels 100000 \
+    --distribution_mode hard \
+    --param_name hard-500 \
+    --seed 6033 \
+    --model_file path/model_200015872.pth \
+    --vid_dir video
+```
+- Add a trajectory path to save trajectories that can be used to train reward nets:
+```bash
+python render.py \
+    --exp_name render \
+    --env_name coinrun \
+    --num_levels 100000 \
+    --distribution_mode hard \
+    --param_name hard-500 \
+    --seed 9073 \
+    --model_file path/model_200015872.pth \
+    --vid_dir video \
+    --traj_path rollouts.npz \
+    --noview \
+    --num_episodes=1000
+    # num_episodes is not actually the number of episodes, but the number of
+    # iterations of n_steps (usually 256 for most hparams) each.
+```
+
+
+### Interpreting a Reward Model
+
+Once you have a reward model, you can use various interpretability methods.
+Use `print_config` to get details of the config params of the sacred scripts, as in:
+
+```bash
+python -m reward_preprocessing.interpret print_config
+```
 
 ## Code Structure
 
@@ -36,7 +94,7 @@ Use `print_config` to get details of the config params of the sacred scripts.
 - `policies`: RL policies for training experts with train_rl.
 - `preprocessing` Reward preprocessing / reward shaping code.
 - `scripts`: All scripts that are not the main scripts of the projects. Helpers and scripts that produce artifacts that are used by the main script. Everything here should either be an executable file or a config for one.
-- `helpers`: Helper scripts that are bash executables.
+- `helpers`: Helper scripts that are bash executables or python scripts that are not full sacred experiments.
 - `trainers`: Our additions to the suite of reward learning algorithms available in imitation. Currently this contains the trainer for training reward nets with supervised learning.
 - `vis`: Visualization code for interpreting reward functions.
 - `interpret.py`: The main script that provides the functionality for this project.
````
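
The `rollouts.npz` written by `--traj_path` above is what the supervised trainer consumes as demonstrations. Before training on it, a quick inspection with plain numpy can catch problems early; the key names below are taken from the trajectory-fixing helper further down in this commit, so treat them as an assumption rather than a guarantee for every file:

```python
# Minimal sanity check of a saved rollout file (illustration, not part of this commit).
import numpy as np

data = np.load("rollouts.npz", allow_pickle=True)
print(data.files)  # expected keys: obs, acts, infos, terminal, rews, indices
print("obs:", data["obs"].shape)        # e.g. (N, 64, 64, 3) uint8 frames for coinrun
print("rews:", data["rews"].shape)      # one reward per step
print("indices:", data["indices"][:5])  # trajectory boundaries into the flat arrays
```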

src/reward_preprocessing/common/utils.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -25,15 +25,15 @@ def make_transition_to_tensor(num_acts):
 
     def transition_to_tensor(transition):
         obs = transition["obs"]
-        # Only normalize for integer types.
         if np.issubdtype(obs.dtype, np.integer):
-            obs = obs / 255.0
-        # For floats we don't divide by 255.0.
+            obs = obs.float() / 255.0
+        # For floats we don't divide by 255.0. In that case we assume the
+        # observation is already in the range [0, 1].
         act = int(transition["acts"])
         next_obs = transition["next_obs"]
-        # Only normalize for integer types.
+
         if np.issubdtype(next_obs.dtype, np.integer):
-            next_obs = next_obs / 255.0
+            next_obs = next_obs.float() / 255.0
 
         transp_obs = np.transpose(obs, (2, 0, 1))
         obs_height = transp_obs.shape[1]
```
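
For reference, the convention this change encodes: only integer-typed observations are rescaled, while float observations are assumed to already lie in [0, 1] and are passed through untouched. A standalone numpy sketch of the same idea (using `astype` rather than the `.float()` call in the diff):

```python
# Illustration only, not code from this commit.
import numpy as np

def normalize_frame(obs: np.ndarray) -> np.ndarray:
    """Scale integer image observations to [0, 1]; leave float ones untouched."""
    if np.issubdtype(obs.dtype, np.integer):
        return obs.astype(np.float32) / 255.0
    return obs  # assumed to already be in [0, 1]

frame = np.full((64, 64, 3), 255, dtype=np.uint8)
assert normalize_frame(frame).max() == 1.0
```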

src/reward_preprocessing/models.py

Lines changed: 0 additions & 57 deletions
```diff
@@ -3,71 +3,14 @@
 
 import gym
 from imitation.rewards.reward_nets import RewardNet
-from imitation.util.networks import build_cnn
 import numpy as np
-from stable_baselines3.common.preprocessing import preprocess_obs
 import torch as th
 
 from reward_preprocessing.env import maze, mountain_car  # noqa: F401
 
 logger = logging.getLogger(__name__)
 
 
-class ProcgenCnnRegressionRewardNet(RewardNet):
-    """RewardNet using a CNN for learning reward using supervised regression on obs, rew
-    pairs."""
-
-    def __init__(self, observation_space: gym.Space, action_space: gym.Space):
-        super().__init__(observation_space=observation_space, action_space=action_space)
-
-        # TODO: Not sure if Cnn (from this module) or build_cnn is better here. The
-        # former gives us more freedom in the architecture.
-        self.cnn_regressor = build_cnn(
-            in_channels=3,
-            hid_channels=[32, 64],
-            out_size=1,
-        )
-
-    def forward(
-        self,
-        state: th.Tensor,
-        action: th.Tensor,
-        next_state: th.Tensor,
-        done: th.Tensor,
-    ) -> th.Tensor:
-        """
-        Args:
-            state: Tensor of shape (batch_size, height, width, channels)
-            action: Tensor of shape (batch_size, action_size)
-            next_state: Tensor of shape (batch_size, state_size)
-            done: Tensor of shape (batch_size,)
-        Returns:
-            Tensor of shape (batch_size,)
-        """
-        # TODO: We always assume shape (batch_size, height, width, channels) for inputs,
-        # do we actually want that or do we want to allow different shapes?
-        # Performs preprocessing for images
-        preprocessed_obs = preprocess_obs(
-            next_state, self.observation_space, normalize_images=self.normalize_images
-        )
-        assert isinstance(preprocessed_obs, th.Tensor)
-        # Reshape from (batch_size [0], height [1], width [2], channels [3])
-        # to (batch_size [0], channels [3], height [1], width [2])
-        if len(preprocessed_obs.shape) == 4:
-            transposed = th.permute(preprocessed_obs, [0, 3, 1, 2])
-        else:
-            logging.warning(
-                f"Encountered unexpected shape {preprocessed_obs.shape}. "
-                "Skipping transpose."
-            )
-            transposed = preprocessed_obs
-        batch_size = transposed.shape[0]
-
-        # Reshape into shape expected by imitation (see RewardNet predict_th())
-        out = self.cnn_regressor(transposed).reshape((batch_size,))
-        return out
-
-
 class MazeRewardNet(RewardNet):
     def __init__(self, size: int, maze_name: str = "EmptyMaze", **kwargs):
         env = gym.make(f"reward_preprocessing/{maze_name}{size}-v0", **kwargs)
```

src/reward_preprocessing/scripts/common/supervised.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -16,4 +16,32 @@ def config():
     batch_size = 32  # Batch size for training a supervised model
     num_loader_workers = 0  # Number of workers for data loading
 
+    # Apparently in sacred I need default values for parameters that I want to be able
+    # to override. At least that's how I interpret this information:
+    # https://github.com/IDSIA/sacred/issues/644
+
+    # Keyword arguments for reward network
+    net_kwargs = dict(
+        use_state=True, use_action=True, use_next_state=True, hid_channels=(32, 64)
+    )
+
     locals()  # quieten flake8
+
+
+@supervised_ingredient.config_hook
+def config_hook(config, command_name, logger) -> dict:
+    """Warn if network is set to `use_done`, since this setting will be overriden
+    in train_regression."""
+    del command_name
+    res = {}
+    if (
+        "use_done" in config["supervised"]["net_kwargs"]
+        and config["supervised"]["net_kwargs"]["use_done"]
+    ):
+        logger.warning(
+            "Supervised training does not support setting use_done to "
+            "True. We don't support networks that take in the done signal. "
+            "This value will be ignored."
+        )
+
+    return res
```
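
The `net_kwargs` defaults above are forwarded almost verbatim to imitation's `CnnRewardNet` in `train_regression` (see that file's diff below), with `use_done` always forced to `False`; the config hook only emits the warning. Roughly, the resulting construction looks like the sketch below. The observation and action spaces here are placeholders for whatever `common.make_venv()` returns, e.g. procgen's 64x64x3 frames and 15 discrete actions:

```python
# Illustration of what the default net_kwargs amount to; not code from this commit.
import gym
import numpy as np
from imitation.rewards.reward_nets import CnnRewardNet

obs_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
act_space = gym.spaces.Discrete(15)

net = CnnRewardNet(
    observation_space=obs_space,
    action_space=act_space,
    use_state=True,
    use_action=True,
    use_next_state=True,
    use_done=False,         # train_regression passes this explicitly
    hid_channels=(32, 64),  # forwarded to the underlying CNN builder
)
```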

src/reward_preprocessing/scripts/config/train_regression.py

Lines changed: 59 additions & 0 deletions

```diff
@@ -0,0 +1,59 @@
+from imitation.scripts.common import common, demonstrations
+import sacred
+
+from reward_preprocessing.scripts.common import supervised
+
+train_regression_ex = sacred.Experiment(
+    "train_regression",
+    ingredients=[
+        common.common_ingredient,
+        demonstrations.demonstrations_ingredient,
+        supervised.supervised_ingredient,
+    ],
+)
+
+
+@train_regression_ex.config
+def defaults():
+    # Every checkpoint_epoch_interval epochs, save the model. Epochs start at 1.
+    checkpoint_epoch_interval = 1
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def use_next_state():
+    supervised = dict(
+        net_kwargs=dict(use_state=False, use_action=False, use_next_state=True)
+    )
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def use_all():
+    supervised = dict(
+        net_kwargs=dict(use_state=True, use_action=True, use_next_state=True)
+    )
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def large_net():
+    # Similar to AlexNet architecture, only in the number of convolutional layers and
+    # the number of channels in them.
+    supervised = dict(net_kwargs=dict(hid_channels=(96, 256, 384, 384, 256)))
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def very_large_net():
+    # Net that has the same convolutional networks as the Impala net used to train
+    # policies. This does not have the other bells and whistles of Impala, such as
+    # residual connections.
+    # This network is probably too unnecessarily large for predicting the rewards using
+    # supervised learning.
+    supervised = dict(
+        net_kwargs=dict(
+            hid_channels=(16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32)
+        )
+    )
+    locals()  # make flake8 happy
```
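
If one wanted to combine these named configs with ordinary overrides from Python, sacred's `Experiment.run` would look roughly like the sketch below. The `supervised.epochs` and `batch_size` keys come from the supervised ingredient above; `demonstrations.rollout_path` is assumed from imitation's demonstrations ingredient and may differ between imitation versions; the import goes through the script module so that the experiment's main command is registered:

```python
# Hypothetical invocation; not part of this commit.
from reward_preprocessing.scripts import train_regression  # registers the main command

train_regression.train_regression_ex.run(
    named_configs=["use_all", "large_net"],
    config_updates={
        "supervised": {"epochs": 10, "batch_size": 32},
        "demonstrations": {"rollout_path": "rollouts.npz"},  # assumed key name
        "checkpoint_epoch_interval": 1,
    },
)
```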

Lines changed: 48 additions & 0 deletions

```diff
@@ -0,0 +1,48 @@
+"""Fix saved trajectory format from that one time that I saved them wrong."""
+import numpy as np
+
+path = "/home/pavel/out/interpret/expert-rollouts/procgen-gm/005/fixed-coin_1000.2k.npz"
+data = np.load(path, allow_pickle=True)
+
+# Observations need to be fixed
+observations = data["obs"]
+
+indices = data["indices"]
+traj_list = []
+for i in range(len(indices)):
+    if i == 0:
+        start = 0
+    else:
+        start = indices[i - 1]
+    end = indices[i]
+    # + 1 because we also want to include the last next_obs
+    obs = observations[start : end + 1]
+    traj_list.append(obs)
+# Also add the last trajectory
+traj_list.append(observations[indices[-1] :])
+
+# Concatenate them together, duplicates and all
+new_observations = np.concatenate(traj_list, axis=0)
+
+# Sanity check
+assert (
+    np.cumsum([len(traj) - 1 for traj in traj_list[:-1]]) == np.array(indices)
+).all()
+
+new_dict = {
+    "obs": new_observations,
+    "acts": data["acts"],
+    "infos": data["infos"],
+    "terminal": data["terminal"],
+    "rews": data["rews"],
+    "indices": data["indices"],
+}
+
+# Update path name
+split = path.split(".")
+split[-2] += "_fixed"
+save_path = ".".join(split)
+
+# Save fixed data
+with open(save_path, "wb") as f:
+    np.savez_compressed(f, **new_dict)
```
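
A quick way to check the rewritten file, reusing `path` and `save_path` from the script above: since each of the first `len(indices)` trajectories gains exactly one extra frame from the `[start : end + 1]` slice, `obs` should grow by `len(indices)` entries while everything else stays the same. A small follow-up sketch, not part of the commit:

```python
# Verify the fixed rollout file against the original (illustration only).
import numpy as np

original = np.load(path, allow_pickle=True)
fixed = np.load(save_path, allow_pickle=True)

assert fixed["obs"].shape[0] == original["obs"].shape[0] + len(original["indices"])
assert (fixed["indices"] == original["indices"]).all()
assert fixed["rews"].shape == original["rews"].shape
print("fixed file looks consistent:", fixed["obs"].shape)
```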

src/reward_preprocessing/scripts/train_regression.py

Lines changed: 13 additions & 21 deletions
```diff
@@ -3,31 +3,14 @@
 
 from typing import Sequence, cast
 from imitation.data import types
+from imitation.rewards.reward_nets import CnnRewardNet
 from imitation.scripts.common import common, demonstrations
-import sacred
 from sacred.observers import FileStorageObserver
 import torch as th
 
-from reward_preprocessing.models import ProcgenCnnRegressionRewardNet
-from reward_preprocessing.scripts.common import supervised as supervised_config
+from reward_preprocessing.scripts.config.train_regression import train_regression_ex
 from reward_preprocessing.trainers.supervised_trainer import SupervisedTrainer
 
-train_regression_ex = sacred.Experiment(
-    "train_regression",
-    ingredients=[
-        common.common_ingredient,
-        demonstrations.demonstrations_ingredient,
-        supervised_config.supervised_ingredient,
-    ],
-)
-
-
-@train_regression_ex.config
-def defaults():
-    # Every checkpoint_epoch_interval epochs, save the model. Epochs start at 1.
-    checkpoint_epoch_interval = 1
-    locals()  # make flake8 happy
-
 
 def save(trainer: SupervisedTrainer, save_path):
     """Save regression model."""
@@ -47,9 +30,15 @@ def train_regression(supervised, checkpoint_epoch_interval: int):  # From ingred
 
     with common.make_venv() as venv:
         # Init the regression CNN
-        model = ProcgenCnnRegressionRewardNet(
-            observation_space=venv.observation_space, action_space=venv.action_space
+        model = CnnRewardNet(
+            **supervised["net_kwargs"],
+            # We don't want the following to be overriden.
+            observation_space=venv.observation_space,
+            action_space=venv.action_space,
+            use_done=False,
        )
+        custom_logger.log(model)
+
         device = "cuda" if th.cuda.is_available() else "cpu"
         loss_fn = th.nn.MSELoss()
@@ -69,13 +58,16 @@ def train_regression(supervised, checkpoint_epoch_interval: int):  # From ingred
         # Move model to correct device
         model.to(device)
 
+        trainer.log_data_stats()
+
         def checkpoint_callback(epoch_num):
             if (
                 checkpoint_epoch_interval > 0
                 and epoch_num % checkpoint_epoch_interval == 0
             ):
                 save(trainer, os.path.join(log_dir, "checkpoints", f"{epoch_num:05d}"))
 
+        custom_logger.log("Start training regression model.")
         # Start training
         trainer.train(
            num_epochs=supervised["epochs"],
```

src/reward_preprocessing/trainers/__init__.py

Whitespace-only changes.
