
Commit ab3d103

Merge pull request #31 from HumanCompatibleAI/imitation-final-obs
Final observation and CnnRewardNet
2 parents 33b7979 + e4caa7f commit ab3d103

7 files changed: +211 -5 lines changed

requirements.txt (+2 -2)

@@ -2,7 +2,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch==1.12.1
 torchvision==0.13.1
-stable-baselines3==1.6.0
+stable-baselines3==1.6.1
 sacred==0.8.2
 numpy==1.21.2
 gym==0.21
@@ -12,7 +12,7 @@ seals==0.1.5
 torch-lucent==0.1.8
 jupyter==1.0.0
 git+https://github.com/ejnnr/mazelab.git@3042551
-git+https://github.com/HumanCompatibleAI/imitation.git@91c66b7377
+git+https://github.com/HumanCompatibleAI/imitation.git@40a2a559706e50bf60d7cc388a2c36dd0d4e8619
 # This version includes some fixes that are not in the newest pip version
 git+https://github.com/openai/gym3.git@4c38246
 # This commit on the branch final-obs of my (PavelCz) fork of procgen includes

src/reward_preprocessing/__init__.py (+15)

@@ -0,0 +1,15 @@
+"""Register custom environments"""
+
+import gym
+import procgen  # noqa: F401
+
+import reward_preprocessing.procgen as rmi_procgen  # noqa: I001
+
+# Procgen
+
+# note that procgen was imported to add procgen environments to the gym registry
+
+GYM_PROCGEN_ENV_SPECS = list(
+    filter(rmi_procgen.supported_procgen_env, gym.envs.registry.all())
+)
+rmi_procgen.register_procgen_envs(GYM_PROCGEN_ENV_SPECS)
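For context, a small sketch of what the filter keeps (an editor's illustration, not part of the diff; the second id is hypothetical and only demonstrates the three-part check used by supported_procgen_env):

    import re

    def has_three_parts(env_id: str) -> bool:
        # Mirrors the split used by supported_procgen_env in procgen.py below.
        return len(re.split("-|_", env_id)) == 3

    assert has_three_parts("procgen-coinrun-v0")           # kept by the filter
    assert not has_three_parts("procgen-coinrun_aisc-v0")  # hypothetical id with an extra part; dropped

The resulting GYM_PROCGEN_ENV_SPECS list is then handed to register_procgen_envs, which derives the -autoreset and -final-obs variants shown in procgen.py below.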

src/reward_preprocessing/common/utils.py (+1 -1)

@@ -284,7 +284,7 @@ def log_img_wandb(
 
 
 def array_to_image(arr: np.ndarray, scale: int) -> PIL.Image.Image:
-    """Take numpy array on [0,1] scale, return PIL image."""
+    """Take numpy array on [0,1] scale with shape (h,w,c), return PIL image."""
     return Image.fromarray(np.uint8(arr * 255), mode="RGB").resize(
         # PIL expects tuple of (width, height), numpy's dimension 1 is width, and
         # dimension 0 height.
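The new docstring pins down the expected (h, w, c) layout; a quick sanity check of that convention and of the width/height swap mentioned in the comment (editor's sketch, not part of the diff):

    import numpy as np
    from PIL import Image

    arr = np.random.rand(64, 32, 3)  # (h, w, c): height 64, width 32, RGB on [0, 1]
    img = Image.fromarray(np.uint8(arr * 255), mode="RGB")
    assert img.size == (32, 64)  # PIL reports (width, height), hence the swap in resize()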

src/reward_preprocessing/models.py (+37 -1)

@@ -2,7 +2,7 @@
 from typing import Tuple
 
 import gym
-from imitation.rewards.reward_nets import RewardNet
+from imitation.rewards.reward_nets import CnnRewardNet, RewardNet
 import numpy as np
 import torch as th
 
@@ -11,6 +11,42 @@
 logger = logging.getLogger(__name__)
 
 
+class CnnRewardNetWorkaround(CnnRewardNet):
+    """Identical to CnnRewardNet, except that it fixes imitation issue #644 by
+    removing normalize_input_layer from the kwargs.
+    TODO: Reconsider this once the underlying issue is fixed.
+    """
+
+    def __init__(
+        self,
+        observation_space: gym.Space,
+        action_space: gym.Space,
+        use_state: bool = True,
+        use_action: bool = True,
+        use_next_state: bool = False,
+        use_done: bool = False,
+        hwc_format: bool = True,
+        **kwargs,
+    ):
+        normalize = kwargs.pop("normalize_input_layer", None)
+        if normalize is not None:
+            logger.warning(
+                f"normalize_input_layer={normalize} was provided, will be ignored. See "
+                "imitation issue #644"
+            )
+
+        super().__init__(
+            observation_space,
+            action_space,
+            use_state,
+            use_action,
+            use_next_state,
+            use_done,
+            hwc_format,
+            **kwargs,
+        )
+
+
 class MazeRewardNet(RewardNet):
     def __init__(self, size: int, maze_name: str = "EmptyMaze", **kwargs):
         env = gym.make(f"reward_preprocessing/{maze_name}{size}-v0", **kwargs)
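A usage sketch for the workaround (editor's addition, not part of the diff; the spaces are stand-ins roughly matching procgen's 64x64 RGB observations and 15 discrete actions):

    import gym
    import numpy as np

    from reward_preprocessing.models import CnnRewardNetWorkaround

    obs_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
    act_space = gym.spaces.Discrete(15)

    # normalize_input_layer is popped and only logged instead of being forwarded
    # to CnnRewardNet, which mishandles it (imitation issue #644).
    net = CnnRewardNetWorkaround(
        obs_space,
        act_space,
        use_state=True,
        use_action=True,
        normalize_input_layer="RunningNorm",  # ignored, with a warning
    )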

src/reward_preprocessing/policies/utils.py (+8 -1)

@@ -2,7 +2,7 @@
 from typing import Callable, Optional, Union
 
 import gym
-from imitation.data.types import AnyPath, path_to_str
+from imitation.data.types import AnyPath
 import numpy as np
 from stable_baselines3 import PPO
 from stable_baselines3.common.base_class import BaseAlgorithm
@@ -16,6 +16,13 @@
 Policy = Union[gym.Space, PolicyCallable, BaseAlgorithm, BasePolicy]
 
 
+def path_to_str(path: AnyPath) -> str:
+    if isinstance(path, bytes):
+        return path.decode()
+    else:
+        return str(path)
+
+
 def policy_to_callable(
     policy: Policy, deterministic_policy: bool = True
 ) -> PolicyCallable:
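The diff stops importing path_to_str from imitation.data.types (presumably it is no longer exported by the newer pinned imitation commit) and re-implements it locally. Its behavior, illustrated (editor's sketch, not part of the diff):

    from pathlib import Path

    from reward_preprocessing.policies.utils import path_to_str

    assert path_to_str("runs/model.zip") == "runs/model.zip"
    assert path_to_str(b"runs/model.zip") == "runs/model.zip"
    assert path_to_str(Path("runs/model.zip")) == str(Path("runs/model.zip"))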

src/reward_preprocessing/procgen.py (+86)

@@ -0,0 +1,86 @@
+"""Code to register procgen environments to train reward funcs on."""
+
+import re
+from typing import Iterable
+
+import gym
+from seals.util import AutoResetWrapper, get_gym_max_episode_steps
+
+
+def supported_procgen_env(gym_spec: gym.envs.registration.EnvSpec) -> bool:
+    starts_with_procgen = gym_spec.id.startswith("procgen-")
+    three_parts = len(re.split("-|_", gym_spec.id)) == 3
+    return starts_with_procgen and three_parts
+
+
+def make_auto_reset_procgen(procgen_env_id: str, **make_env_kwargs) -> gym.Env:
+    env = AutoResetWrapper(gym.make(procgen_env_id, **make_env_kwargs))
+    return env
+
+
+def make_fin_obs_procgen(procgen_env_id: str, **make_env_kwargs) -> gym.Env:
+    env = ProcgenFinalObsWrapper(gym.make(procgen_env_id, **make_env_kwargs))
+    return env
+
+
+def local_name_autoreset(gym_spec: gym.envs.registration.EnvSpec) -> str:
+    split_str = gym_spec.id.split("-")
+    version = split_str[-1]
+    split_str[-1] = "autoreset"
+    return "-".join(split_str + [version])
+
+
+def local_name_fin_obs(gym_spec: gym.envs.registration.EnvSpec) -> str:
+    split_str = gym_spec.id.split("-")
+    version = split_str[-1]
+    split_str[-1] = "final-obs"
+    return "-".join(split_str + [version])
+
+
+def register_procgen_envs(
+    gym_procgen_env_specs: Iterable[gym.envs.registration.EnvSpec],
+) -> None:
+
+    for gym_spec in gym_procgen_env_specs:
+        gym.register(
+            id=local_name_autoreset(gym_spec),
+            entry_point="reward_preprocessing.procgen:make_auto_reset_procgen",
+            max_episode_steps=get_gym_max_episode_steps(gym_spec.id),
+            kwargs=dict(procgen_env_id=gym_spec.id),
+        )
+
+    # There are no envs that have both autoreset and final obs wrappers.
+    # fin-obs would only affect the terminal_observation in the info dict, if it were
+    # to be wrapped by an AutoResetWrapper. Since, at the moment, we don't use the
+    # terminal_observation in the info dict, there is no point to combining them.
+    for gym_spec in gym_procgen_env_specs:
+        gym.register(
+            id=local_name_fin_obs(gym_spec),
+            entry_point="reward_preprocessing.procgen:make_fin_obs_procgen",
+            max_episode_steps=get_gym_max_episode_steps(gym_spec.id),
+            kwargs=dict(procgen_env_id=gym_spec.id),
+        )
+
+
+class ProcgenFinalObsWrapper(gym.Wrapper):
+    """Returns the final observation of gym3 procgen environment, correcting for the
+    fact that Procgen gym environments return the second-to-last observation again
+    instead of the final observation.
+
+    Only works correctly when the 'done' signal coincides with the end of an episode
+    (which is not the case when using e.g. the seals AutoResetWrapper).
+    Requires the use of the PavelCz/procgenAISC fork, which adds the 'final_obs' value.
+
+    Since procgen builds on gym3, it always resets the environment after a terminal
+    state. The final 'obs' returned when done==True will be the obs that was already
+    returned in the previous step. In our fork of procgen, we save the true last
+    observation of the terminated episode in the info dict. This wrapper extracts that
+    obs and returns it.
+    """
+
+    def step(self, action):
+        """When done=True, returns the final_obs from the dict."""
+        obs, rew, done, info = self.env.step(action)
+        if done:
+            obs = info["final_obs"]
+        return obs, rew, done, info
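A minimal rollout sketch for the final-obs variant (editor's addition, not part of the diff). It assumes the package has been imported so the registration in __init__.py above has run, and that the PavelCz procgen fork pinned in requirements.txt is installed so info["final_obs"] exists:

    import gym

    import reward_preprocessing  # noqa: F401  # triggers register_procgen_envs

    # "procgen-coinrun-v0" gains two derived ids via the naming helpers above:
    #   procgen-coinrun-autoreset-v0  (seals AutoResetWrapper)
    #   procgen-coinrun-final-obs-v0  (ProcgenFinalObsWrapper)
    env = gym.make("procgen-coinrun-final-obs-v0")
    obs = env.reset()
    done = False
    while not done:
        obs, rew, done, info = env.step(env.action_space.sample())
    # With the wrapper, `obs` is the episode's true final observation rather than
    # the repeated second-to-last frame that raw procgen returns when done is True.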
@@ -0,0 +1,62 @@
+"""Thin wrapper around imitation's train_preference_comparisons script."""
+from imitation.scripts.config.train_preference_comparisons import (
+    train_preference_comparisons_ex,
+)
+from imitation.scripts.train_preference_comparisons import main_console
+
+from reward_preprocessing.env.maze import use_config
+from reward_preprocessing.models import CnnRewardNetWorkaround
+import reward_preprocessing.policies.base
+
+use_config(train_preference_comparisons_ex)
+
+
+@train_preference_comparisons_ex.named_config
+def coinrun():
+    """Training with preference comparisons on coinrun."""
+    fragment_length = 200
+    total_comparisons = 100_000
+    total_timesteps = 200_000_000
+    train = dict(
+        policy_cls=reward_preprocessing.policies.base.ImpalaPolicy,
+    )
+    common = dict(
+        env_name="procgen:procgen-coinrun-autoreset-v0",
+        num_vec=256,  # Goal Misg paper uses 64 envs for each of 4 workers.
+        env_make_kwargs=dict(num_levels=100_000, distribution_mode="hard"),
+    )
+    rl = dict(
+        batch_size=256 * 256,
+        rl_kwargs=dict(
+            n_epochs=3,
+            ent_coef=0.01,
+            learning_rate=0.0005,
+            batch_size=8192,
+            gamma=0.999,
+            gae_lambda=0.95,
+            clip_range=0.2,
+            max_grad_norm=0.5,
+            vf_coef=0.5,
+            normalize_advantage=True,
+        ),
+    )
+    reward = dict(
+        # Use default CNN reward net, since procgen envs are image-based.
+        # Also, hopefully, CNNs are more interpretable.
+        net_cls=CnnRewardNetWorkaround,
+    )
+    locals()  # make flake8 happy
+
+
+@train_preference_comparisons_ex.named_config
+def fast_procgen():  # Overrides some settings for fast setup for debugging purposes.
+    rl = dict(batch_size=2, rl_kwargs=dict(batch_size=2))
+    common = dict(num_vec=1)
+    total_comparisons = 32
+    fragment_length = 16
+    total_timesteps = 64
+    locals()  # make flake8 happy
+
+
+if __name__ == "__main__":  # pragma: no cover
+    main_console()
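For orientation, the rollout arithmetic implied by the coinrun named config (editor's sketch, not part of the diff; it assumes imitation's rl.batch_size counts environment transitions per PPO update while rl_kwargs.batch_size is SB3's minibatch size):

    num_vec = 256                  # common.num_vec: parallel environments
    batch_size = 256 * 256         # rl.batch_size: transitions per PPO update (65_536)
    minibatch = 8192               # rl_kwargs.batch_size: SB3 minibatch size
    n_epochs = 3

    steps_per_env = batch_size // num_vec               # 256 env steps per update
    minibatches_per_epoch = batch_size // minibatch     # 8 minibatches
    gradient_steps = n_epochs * minibatches_per_epoch   # 24 gradient steps per update
    updates = 200_000_000 // batch_size                 # ~3051 PPO updates over total_timesteps

The fast_procgen named config shrinks these same knobs to near-trivial values so the whole preference-comparison pipeline can be smoke-tested quickly.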
