|
| 1 | +"""Code to register procgen environments to train reward funcs on.""" |
| 2 | + |
| 3 | +import re |
| 4 | +from typing import Iterable |
| 5 | + |
| 6 | +import gym |
| 7 | +from seals.util import AutoResetWrapper, get_gym_max_episode_steps |
| 8 | + |
| 9 | + |
def supported_procgen_env(gym_spec: gym.envs.registration.EnvSpec) -> bool:
    """Return whether ``gym_spec`` has an ID shaped like a supported procgen env.

    An ID qualifies when it begins with ``"procgen-"`` and yields exactly three
    pieces when split on hyphens and underscores.

    Args:
        gym_spec: Environment spec whose ``id`` attribute is inspected.

    Returns:
        True if both conditions hold, False otherwise.
    """
    env_id = gym_spec.id
    if not env_id.startswith("procgen-"):
        return False
    pieces = re.split("-|_", env_id)
    return len(pieces) == 3
| 14 | + |
| 15 | + |
def make_auto_reset_procgen(procgen_env_id: str, **make_env_kwargs) -> gym.Env:
    """Create the env ``procgen_env_id`` and wrap it in ``AutoResetWrapper``.

    Args:
        procgen_env_id: Gym ID of the environment to create.
        **make_env_kwargs: Extra keyword arguments forwarded to ``gym.make``.

    Returns:
        The created environment wrapped in ``AutoResetWrapper``.
    """
    base_env = gym.make(procgen_env_id, **make_env_kwargs)
    return AutoResetWrapper(base_env)
| 19 | + |
| 20 | + |
def make_fin_obs_procgen(procgen_env_id: str, **make_env_kwargs) -> gym.Env:
    """Create the env ``procgen_env_id`` and wrap it in ``ProcgenFinalObsWrapper``.

    Args:
        procgen_env_id: Gym ID of the environment to create.
        **make_env_kwargs: Extra keyword arguments forwarded to ``gym.make``.

    Returns:
        The created environment wrapped in ``ProcgenFinalObsWrapper``.
    """
    base_env = gym.make(procgen_env_id, **make_env_kwargs)
    return ProcgenFinalObsWrapper(base_env)
| 24 | + |
| 25 | + |
def local_name_autoreset(gym_spec: gym.envs.registration.EnvSpec) -> str:
    """Return the ID used for the autoreset variant of ``gym_spec``.

    ``"autoreset"`` is inserted just before the last hyphen-separated piece of
    the original ID, e.g. ``"procgen-coinrun-v0"`` becomes
    ``"procgen-coinrun-autoreset-v0"``.
    """
    # rpartition yields empty head/sep when the ID has no hyphen, so the result
    # is then simply "autoreset-<id>".
    head, sep, version = gym_spec.id.rpartition("-")
    return f"{head}{sep}autoreset-{version}"
| 31 | + |
| 32 | + |
def local_name_fin_obs(gym_spec: gym.envs.registration.EnvSpec) -> str:
    """Return the ID used for the final-obs variant of ``gym_spec``.

    ``"final-obs"`` is inserted just before the last hyphen-separated piece of
    the original ID, e.g. ``"procgen-coinrun-v0"`` becomes
    ``"procgen-coinrun-final-obs-v0"``.
    """
    # rpartition yields empty head/sep when the ID has no hyphen, so the result
    # is then simply "final-obs-<id>".
    head, sep, version = gym_spec.id.rpartition("-")
    return f"{head}{sep}final-obs-{version}"
| 38 | + |
| 39 | + |
def register_procgen_envs(
    gym_procgen_env_specs: Iterable[gym.envs.registration.EnvSpec],
) -> None:
    """Register autoreset and final-obs variants of the given procgen envs.

    For every spec, two new gym IDs are registered: one named by
    ``local_name_autoreset`` and built by ``make_auto_reset_procgen``, and one
    named by ``local_name_fin_obs`` and built by ``make_fin_obs_procgen``. Both
    use the ``max_episode_steps`` that ``get_gym_max_episode_steps`` reports for
    the original ID.

    Args:
        gym_procgen_env_specs: Specs of the envs to create variants of. Any
            iterable is accepted, including one-shot iterators such as
            generators or ``filter`` objects.
    """
    # Materialize once: the argument is only promised to be an Iterable, and a
    # one-shot iterator would be exhausted by the first pass, silently leaving
    # the final-obs variants unregistered.
    specs = tuple(gym_procgen_env_specs)

    for gym_spec in specs:
        gym.register(
            id=local_name_autoreset(gym_spec),
            entry_point="reward_preprocessing.procgen:make_auto_reset_procgen",
            max_episode_steps=get_gym_max_episode_steps(gym_spec.id),
            kwargs=dict(procgen_env_id=gym_spec.id),
        )

    # There are no envs that have both autoreset and final obs wrappers.
    # final-obs would only affect the terminal_observation in the info dict, if it
    # were to be wrapped by an AutoResetWrapper. Since, at the moment, we don't use
    # the terminal_observation in the info dict, there is no point to combining them.
    for gym_spec in specs:
        gym.register(
            id=local_name_fin_obs(gym_spec),
            entry_point="reward_preprocessing.procgen:make_fin_obs_procgen",
            max_episode_steps=get_gym_max_episode_steps(gym_spec.id),
            kwargs=dict(procgen_env_id=gym_spec.id),
        )
| 63 | + |
| 64 | + |
class ProcgenFinalObsWrapper(gym.Wrapper):
    """Return the true final observation of a gym3 procgen environment.

    Procgen gym environments return the second-to-last observation a second time
    in place of the final one; this wrapper corrects for that.

    Because procgen builds on gym3, the environment is always reset after a
    terminal state, and the 'obs' returned together with done==True is the obs
    that the previous step already returned. Our fork of procgen stores the true
    last observation of the terminated episode in the info dict; this wrapper
    pulls that obs out and returns it.

    Only works correctly when the 'done' signal coincides with the end of an
    episode (which is not the case when using e.g. the seals AutoResetWrapper).
    Requires the PavelCz/procgenAISC fork, which adds the 'final_obs' value.
    """

    def step(self, action):
        """Step the wrapped env; when done=True, return ``info["final_obs"]`` as obs."""
        obs, rew, done, info = self.env.step(action)
        if not done:
            return obs, rew, done, info
        # Episode ended: the obs from the env is stale, use the one saved in info.
        return info["final_obs"], rew, done, info
0 commit comments