
Commit b354cc3

Merge pull request #8 from HumanCompatibleAI/rework-regression-model
Rework training supervised model
2 parents d756cc5 + b8c72e0 commit b354cc3

10 files changed: +373 / -116 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -15,6 +15,7 @@ __pycache__/
 /fig
 src/reward_preprocessing/output
 src/scratch
+src/output
 
 .idea/
 
```

README.md

Lines changed: 60 additions & 2 deletions
````diff
@@ -22,7 +22,65 @@ You can also use docker by building the image and running the scripts from insid
 
 ## Usage
 
-Use `print_config` to get details of the config params of the sacred scripts.
+### Getting a Reward Model
+
+#### Training the Supervised Reward Model
+
+- Get an expert rl agent
+- For coinrun, use this fork: [PavelCz/train-procgen-pytorch](https://github.com/PavelCz/train-procgen-pytorch)
+- Something like this to train:
+```bash
+python train.py \
+    --exp_name coinrun \
+    --env_name coinrun \
+    --num_levels 100000 \
+    --distribution_mode hard \
+    --param_name hard-500 \
+    --num_timesteps 200000000 \
+    --num_checkpoints 5 \
+    --seed 6033 \
+    --random_percent 0 \
+    --gpu_device=3
+```
+- Render videos to see how well the expert does:
+```bash
+python render.py \
+    --exp_name render \
+    --env_name coinrun \
+    --num_levels 100000 \
+    --distribution_mode hard \
+    --param_name hard-500 \
+    --seed 6033 \
+    --model_file path/model_200015872.pth \
+    --vid_dir video
+```
+- Add a trajectory path to save trajectories that can be used to train reward nets:
+```bash
+python render.py \
+    --exp_name render \
+    --env_name coinrun \
+    --num_levels 100000 \
+    --distribution_mode hard \
+    --param_name hard-500 \
+    --seed 9073 \
+    --model_file path/model_200015872.pth \
+    --vid_dir video \
+    --traj_path rollouts.npz \
+    --noview \
+    --num_episodes=1000
+    # num_episodes is not actually the number of episodes, but the number of
+    # iterations of n_steps (usually 256 for most hparams) each.
+```
+
+
+### Interpreting a Reward Model
+
+Once you have a reward model, you can use various interpretability methods.
+Use `print_config` to get details of the config params of the sacred scripts, as in:
+
+```bash
+python -m reward_preprocessing.interpret print_config
+```
 
 ## Code Structure
 
@@ -36,7 +94,7 @@ Use `print_config` to get details of the config params of the sacred scripts.
 - `policies`: RL policies for training experts with train_rl.
 - `preprocessing` Reward preprocessing / reward shaping code.
 - `scripts`: All scripts that are not the main scripts of the projects. Helpers and scripts that produce artifacts that are used by the main script. Everything here should either be an executable file or a config for one.
-- `helpers`: Helper scripts that are bash executables.
+- `helpers`: Helper scripts that are bash executables or python scripts that are not full sacred experiments.
 - `trainers`: Our additions to the suite of reward learning algorithms available in imitation. Currently this contains the trainer for training reward nets with supervised learning.
 - `vis`: Visualization code for interpreting reward functions.
 - `interpret.py`: The main script that provides the functionality for this project.
````
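
The `rollouts.npz` written by `--traj_path` above is what the supervised trainer consumes as demonstrations. Before training on it, a quick inspection with plain numpy can catch problems early; the key names below are taken from the trajectory-fixing helper further down in this commit, so treat them as an assumption rather than a guarantee for every file:

```python
# Minimal sanity check of a saved rollout file (illustration, not part of this commit).
import numpy as np

data = np.load("rollouts.npz", allow_pickle=True)
print(data.files)  # expected keys: obs, acts, infos, terminal, rews, indices
print("obs:", data["obs"].shape)        # e.g. (N, 64, 64, 3) uint8 frames for coinrun
print("rews:", data["rews"].shape)      # one reward per step
print("indices:", data["indices"][:5])  # trajectory boundaries into the flat arrays
```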

src/reward_preprocessing/common/utils.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -25,15 +25,15 @@ def make_transition_to_tensor(num_acts):
 
     def transition_to_tensor(transition):
         obs = transition["obs"]
-        # Only normalize for integer types.
         if np.issubdtype(obs.dtype, np.integer):
-            obs = obs / 255.0
-        # For floats we don't divide by 255.0.
+            obs = obs.float() / 255.0
+        # For floats we don't divide by 255.0. In that case we assume the
+        # observation is already in the range [0, 1].
         act = int(transition["acts"])
         next_obs = transition["next_obs"]
-        # Only normalize for integer types.
+
         if np.issubdtype(next_obs.dtype, np.integer):
-            next_obs = next_obs / 255.0
+            next_obs = next_obs.float() / 255.0
 
         transp_obs = np.transpose(obs, (2, 0, 1))
         obs_height = transp_obs.shape[1]
```
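
For reference, the convention this change encodes: only integer-typed observations are rescaled, while float observations are assumed to already lie in [0, 1] and are passed through untouched. A standalone numpy sketch of the same idea (using `astype` rather than the `.float()` call in the diff):

```python
# Illustration only, not code from this commit.
import numpy as np

def normalize_frame(obs: np.ndarray) -> np.ndarray:
    """Scale integer image observations to [0, 1]; leave float ones untouched."""
    if np.issubdtype(obs.dtype, np.integer):
        return obs.astype(np.float32) / 255.0
    return obs  # assumed to already be in [0, 1]

frame = np.full((64, 64, 3), 255, dtype=np.uint8)
assert normalize_frame(frame).max() == 1.0
```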

src/reward_preprocessing/models.py

Lines changed: 0 additions & 57 deletions
```diff
@@ -3,71 +3,14 @@
 
 import gym
 from imitation.rewards.reward_nets import RewardNet
-from imitation.util.networks import build_cnn
 import numpy as np
-from stable_baselines3.common.preprocessing import preprocess_obs
 import torch as th
 
 from reward_preprocessing.env import maze, mountain_car  # noqa: F401
 
 logger = logging.getLogger(__name__)
 
 
-class ProcgenCnnRegressionRewardNet(RewardNet):
-    """RewardNet using a CNN for learning reward using supervised regression on obs, rew
-    pairs."""
-
-    def __init__(self, observation_space: gym.Space, action_space: gym.Space):
-        super().__init__(observation_space=observation_space, action_space=action_space)
-
-        # TODO: Not sure if Cnn (from this module) or build_cnn is better here. The
-        # former gives us more freedom in the architecture.
-        self.cnn_regressor = build_cnn(
-            in_channels=3,
-            hid_channels=[32, 64],
-            out_size=1,
-        )
-
-    def forward(
-        self,
-        state: th.Tensor,
-        action: th.Tensor,
-        next_state: th.Tensor,
-        done: th.Tensor,
-    ) -> th.Tensor:
-        """
-        Args:
-            state: Tensor of shape (batch_size, height, width, channels)
-            action: Tensor of shape (batch_size, action_size)
-            next_state: Tensor of shape (batch_size, state_size)
-            done: Tensor of shape (batch_size,)
-        Returns:
-            Tensor of shape (batch_size,)
-        """
-        # TODO: We always assume shape (batch_size, height, width, channels) for inputs,
-        # do we actually want that or do we want to allow different shapes?
-        # Performs preprocessing for images
-        preprocessed_obs = preprocess_obs(
-            next_state, self.observation_space, normalize_images=self.normalize_images
-        )
-        assert isinstance(preprocessed_obs, th.Tensor)
-        # Reshape from (batch_size [0], height [1], width [2], channels [3])
-        # to (batch_size [0], channels [3], height [1], width [2])
-        if len(preprocessed_obs.shape) == 4:
-            transposed = th.permute(preprocessed_obs, [0, 3, 1, 2])
-        else:
-            logging.warning(
-                f"Encountered unexpected shape {preprocessed_obs.shape}. "
-                "Skipping transpose."
-            )
-            transposed = preprocessed_obs
-        batch_size = transposed.shape[0]
-
-        # Reshape into shape expected by imitation (see RewardNet predict_th())
-        out = self.cnn_regressor(transposed).reshape((batch_size,))
-        return out
-
-
 class MazeRewardNet(RewardNet):
     def __init__(self, size: int, maze_name: str = "EmptyMaze", **kwargs):
         env = gym.make(f"reward_preprocessing/{maze_name}{size}-v0", **kwargs)
```

src/reward_preprocessing/scripts/common/supervised.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -16,4 +16,32 @@ def config():
     batch_size = 32  # Batch size for training a supervised model
     num_loader_workers = 0  # Number of workers for data loading
 
+    # Apparently in sacred I need default values for parameters that I want to be able
+    # to override. At least that's how I interpret this information:
+    # https://github.com/IDSIA/sacred/issues/644
+
+    # Keyword arguments for reward network
+    net_kwargs = dict(
+        use_state=True, use_action=True, use_next_state=True, hid_channels=(32, 64)
+    )
+
     locals()  # quieten flake8
+
+
+@supervised_ingredient.config_hook
+def config_hook(config, command_name, logger) -> dict:
+    """Warn if network is set to `use_done`, since this setting will be overriden
+    in train_regression."""
+    del command_name
+    res = {}
+    if (
+        "use_done" in config["supervised"]["net_kwargs"]
+        and config["supervised"]["net_kwargs"]["use_done"]
+    ):
+        logger.warning(
+            "Supervised training does not support setting use_done to "
+            "True. We don't support networks that take in the done signal. "
+            "This value will be ignored."
+        )
+
+    return res
```
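
The `net_kwargs` defaults above are forwarded almost verbatim to imitation's `CnnRewardNet` in `train_regression` (see that file's diff below), with `use_done` always forced to `False`; the config hook only emits the warning. Roughly, the resulting construction looks like the sketch below. The observation and action spaces here are placeholders for whatever `common.make_venv()` returns, e.g. procgen's 64x64x3 frames and 15 discrete actions:

```python
# Illustration of what the default net_kwargs amount to; not code from this commit.
import gym
import numpy as np
from imitation.rewards.reward_nets import CnnRewardNet

obs_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
act_space = gym.spaces.Discrete(15)

net = CnnRewardNet(
    observation_space=obs_space,
    action_space=act_space,
    use_state=True,
    use_action=True,
    use_next_state=True,
    use_done=False,         # train_regression passes this explicitly
    hid_channels=(32, 64),  # forwarded to the underlying CNN builder
)
```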

src/reward_preprocessing/scripts/config/train_regression.py

Lines changed: 59 additions & 0 deletions

```diff
@@ -0,0 +1,59 @@
+from imitation.scripts.common import common, demonstrations
+import sacred
+
+from reward_preprocessing.scripts.common import supervised
+
+train_regression_ex = sacred.Experiment(
+    "train_regression",
+    ingredients=[
+        common.common_ingredient,
+        demonstrations.demonstrations_ingredient,
+        supervised.supervised_ingredient,
+    ],
+)
+
+
+@train_regression_ex.config
+def defaults():
+    # Every checkpoint_epoch_interval epochs, save the model. Epochs start at 1.
+    checkpoint_epoch_interval = 1
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def use_next_state():
+    supervised = dict(
+        net_kwargs=dict(use_state=False, use_action=False, use_next_state=True)
+    )
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def use_all():
+    supervised = dict(
+        net_kwargs=dict(use_state=True, use_action=True, use_next_state=True)
+    )
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def large_net():
+    # Similar to AlexNet architecture, only in the number of convolutional layers and
+    # the number of channels in them.
+    supervised = dict(net_kwargs=dict(hid_channels=(96, 256, 384, 384, 256)))
+    locals()  # make flake8 happy
+
+
+@train_regression_ex.named_config
+def very_large_net():
+    # Net that has the same convolutional networks as the Impala net used to train
+    # policies. This does not have the other bells and whistles of Impala, such as
+    # residual connections.
+    # This network is probably too unnecessarily large for predicting the rewards using
+    # supervised learning.
+    supervised = dict(
+        net_kwargs=dict(
+            hid_channels=(16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32)
+        )
+    )
+    locals()  # make flake8 happy
```
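
If one wanted to combine these named configs with ordinary overrides from Python, sacred's `Experiment.run` would look roughly like the sketch below. The `supervised.epochs` and `batch_size` keys come from the supervised ingredient above; `demonstrations.rollout_path` is assumed from imitation's demonstrations ingredient and may differ between imitation versions; the import goes through the script module so that the experiment's main command is registered:

```python
# Hypothetical invocation; not part of this commit.
from reward_preprocessing.scripts import train_regression  # registers the main command

train_regression.train_regression_ex.run(
    named_configs=["use_all", "large_net"],
    config_updates={
        "supervised": {"epochs": 10, "batch_size": 32},
        "demonstrations": {"rollout_path": "rollouts.npz"},  # assumed key name
        "checkpoint_epoch_interval": 1,
    },
)
```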

Lines changed: 48 additions & 0 deletions

```diff
@@ -0,0 +1,48 @@
+"""Fix saved trajectory format from that one time that I saved them wrong."""
+import numpy as np
+
+path = "/home/pavel/out/interpret/expert-rollouts/procgen-gm/005/fixed-coin_1000.2k.npz"
+data = np.load(path, allow_pickle=True)
+
+# Observations need to be fixed
+observations = data["obs"]
+
+indices = data["indices"]
+traj_list = []
+for i in range(len(indices)):
+    if i == 0:
+        start = 0
+    else:
+        start = indices[i - 1]
+    end = indices[i]
+    # + 1 because we also want to include the last next_obs
+    obs = observations[start : end + 1]
+    traj_list.append(obs)
+# Also add the last trajectory
+traj_list.append(observations[indices[-1] :])
+
+# Concatenate them together, duplicates and all
+new_observations = np.concatenate(traj_list, axis=0)
+
+# Sanity check
+assert (
+    np.cumsum([len(traj) - 1 for traj in traj_list[:-1]]) == np.array(indices)
+).all()
+
+new_dict = {
+    "obs": new_observations,
+    "acts": data["acts"],
+    "infos": data["infos"],
+    "terminal": data["terminal"],
+    "rews": data["rews"],
+    "indices": data["indices"],
+}
+
+# Update path name
+split = path.split(".")
+split[-2] += "_fixed"
+save_path = ".".join(split)
+
+# Save fixed data
+with open(save_path, "wb") as f:
+    np.savez_compressed(f, **new_dict)
```
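
A quick way to check the rewritten file, reusing `path` and `save_path` from the script above: since each of the first `len(indices)` trajectories gains exactly one extra frame from the `[start : end + 1]` slice, `obs` should grow by `len(indices)` entries while everything else stays the same. A small follow-up sketch, not part of the commit:

```python
# Verify the fixed rollout file against the original (illustration only).
import numpy as np

original = np.load(path, allow_pickle=True)
fixed = np.load(save_path, allow_pickle=True)

assert fixed["obs"].shape[0] == original["obs"].shape[0] + len(original["indices"])
assert (fixed["indices"] == original["indices"]).all()
assert fixed["rews"].shape == original["rews"].shape
print("fixed file looks consistent:", fixed["obs"].shape)
```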

src/reward_preprocessing/scripts/train_regression.py

Lines changed: 13 additions & 21 deletions
```diff
@@ -3,31 +3,14 @@
 
 from typing import Sequence, cast
 from imitation.data import types
+from imitation.rewards.reward_nets import CnnRewardNet
 from imitation.scripts.common import common, demonstrations
-import sacred
 from sacred.observers import FileStorageObserver
 import torch as th
 
-from reward_preprocessing.models import ProcgenCnnRegressionRewardNet
-from reward_preprocessing.scripts.common import supervised as supervised_config
+from reward_preprocessing.scripts.config.train_regression import train_regression_ex
 from reward_preprocessing.trainers.supervised_trainer import SupervisedTrainer
 
-train_regression_ex = sacred.Experiment(
-    "train_regression",
-    ingredients=[
-        common.common_ingredient,
-        demonstrations.demonstrations_ingredient,
-        supervised_config.supervised_ingredient,
-    ],
-)
-
-
-@train_regression_ex.config
-def defaults():
-    # Every checkpoint_epoch_interval epochs, save the model. Epochs start at 1.
-    checkpoint_epoch_interval = 1
-    locals()  # make flake8 happy
-
 
 def save(trainer: SupervisedTrainer, save_path):
     """Save regression model."""
@@ -47,9 +30,15 @@ def train_regression(supervised, checkpoint_epoch_interval: int):  # From ingred
 
     with common.make_venv() as venv:
         # Init the regression CNN
-        model = ProcgenCnnRegressionRewardNet(
-            observation_space=venv.observation_space, action_space=venv.action_space
+        model = CnnRewardNet(
+            **supervised["net_kwargs"],
+            # We don't want the following to be overriden.
+            observation_space=venv.observation_space,
+            action_space=venv.action_space,
+            use_done=False,
        )
+        custom_logger.log(model)
+
         device = "cuda" if th.cuda.is_available() else "cpu"
         loss_fn = th.nn.MSELoss()
@@ -69,13 +58,16 @@ def train_regression(supervised, checkpoint_epoch_interval: int):  # From ingred
         # Move model to correct device
         model.to(device)
 
+        trainer.log_data_stats()
+
         def checkpoint_callback(epoch_num):
             if (
                 checkpoint_epoch_interval > 0
                 and epoch_num % checkpoint_epoch_interval == 0
             ):
                 save(trainer, os.path.join(log_dir, "checkpoints", f"{epoch_num:05d}"))
 
+        custom_logger.log("Start training regression model.")
         # Start training
         trainer.train(
            num_epochs=supervised["epochs"],
```

src/reward_preprocessing/trainers/__init__.py

Whitespace-only changes.
