From bd095fe16920663998273ce8d61382889088ab4b Mon Sep 17 00:00:00 2001
From: Nicholas Gorichs
Date: Thu, 1 Feb 2024 17:46:45 -0600
Subject: [PATCH] per step reward

Per game reward is better for a pre-trained model
---
 connectx/connectx_gym/connectx_env.py  |  4 +++-
 connectx/connectx_gym/reward_spaces.py | 17 ++++++++---------
 connectx/connectx_gym/wrappers.py      |  7 +++----
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/connectx/connectx_gym/connectx_env.py b/connectx/connectx_gym/connectx_env.py
index 07b5a1a..1ca3b30 100644
--- a/connectx/connectx_gym/connectx_env.py
+++ b/connectx/connectx_gym/connectx_env.py
@@ -1,6 +1,7 @@
 from kaggle_environments import make
 from typing import Dict, List, Optional, Tuple
 import gym
+import math
 import numpy as np
 from scipy.special import softmax
 
@@ -46,7 +47,8 @@ def reset(self, **kwargs):
         return obs, self.reward, done, self.info
 
     def step(self, action):
-        obs, self.reward, done, _ = self.trainer.step(action)
+        obs, _, done, _ = self.trainer.step(action)
+        self.reward = self.turn / math.prod(BOARD_SIZE)
         self._update(obs, self.reward, action)
         return obs, self.reward, done, self.info
 
diff --git a/connectx/connectx_gym/reward_spaces.py b/connectx/connectx_gym/reward_spaces.py
index deffcbf..5bed144 100644
--- a/connectx/connectx_gym/reward_spaces.py
+++ b/connectx/connectx_gym/reward_spaces.py
@@ -42,7 +42,7 @@ def compute_rewards(self, game_state: Environment) -> Tuple[Tuple[float, float],
         pass
 
     @abstractmethod
-    def _compute_rewards(self, game_state: dict, done: bool) -> Tuple[float, float]:
+    def _compute_rewards(self, game_state: dict) -> Tuple[float, float]:
         pass
 
 
@@ -50,10 +50,10 @@ class GameResultReward(FullGameRewardSpace):
     @staticmethod
     def get_reward_spec() -> RewardSpec:
         return RewardSpec(
-            reward_min=-1.,
-            reward_max=1.,
+            reward_min=-10.,
+            reward_max=10.,
             zero_sum=True,
-            only_once=True
+            only_once=False
         )
 
     def __init__(self, early_stop: bool = False, **kwargs):
@@ -63,12 +63,11 @@ def __init__(self, early_stop: bool = False, **kwargs):
     def compute_rewards(self, game_state: Environment) -> Tuple[float, bool]:
         if self.early_stop:
             raise NotImplementedError  # done = done or should_early_stop(game_state)
-        done = game_state.done
-        return self._compute_rewards(game_state, done), done
+        return self._compute_rewards(game_state), game_state.done
 
-    def _compute_rewards(self, game_state: Environment, done: bool) -> float:
-        if not done:
-            return 0.
+    def _compute_rewards(self, game_state: Environment) -> float:
+        if game_state.done:
+            return game_state.reward * 10.
         return game_state.reward
 
     # def compute_rewards(self, game_state: Environment) -> Tuple[Tuple[float, float], bool]:
diff --git a/connectx/connectx_gym/wrappers.py b/connectx/connectx_gym/wrappers.py
index ab866ba..e167adc 100644
--- a/connectx/connectx_gym/wrappers.py
+++ b/connectx/connectx_gym/wrappers.py
@@ -18,14 +18,13 @@ def __init__(self, env: gym.Env, reward_space: BaseRewardSpace):
         super(LoggingEnv, self).__init__(env)
         self.reward_space = reward_space
         self.vals_peak = {}
-        self.reward_sum = [0., 0.]
+        self.reward_sum = []
 
     def info(self, info: Dict[str, np.ndarray], rewards: int) -> Dict[str, np.ndarray]:
         info = copy.copy(info)
-        player = self.env.unwrapped.player_id
         logs = dict(step=self.env.unwrapped.turn)
-        self.reward_sum[player] = rewards + self.reward_sum[player]
+        self.reward_sum.append(rewards)
         logs["mean_cumulative_rewards"] = [np.mean(self.reward_sum)]
         logs["mean_cumulative_reward_magnitudes"] = [np.mean(np.abs(self.reward_sum))]
         logs["max_cumulative_rewards"] = [np.max(self.reward_sum)]
@@ -38,7 +37,7 @@ def info(self, info: Dict[str, np.ndarray], rewards: int) -> Dict[str, np.ndarra
     def reset(self, **kwargs):
         obs, reward, done, info = super(LoggingEnv, self).reset(**kwargs)
-        self.reward_sum = [0., 0.]
+        self.reward_sum = [reward]
         return obs, [reward], done, self.info(info, reward)
 
     def step(self, action: Dict[str, np.ndarray]):
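
Note (not part of the patch): a minimal standalone sketch of the arithmetic this change introduces, for sanity-checking the reward scale. It assumes BOARD_SIZE is the standard 6x7 ConnectX grid and that `turn` counts moves from the start of the game; the helper names here are illustrative, not the repo's real classes.

import math

# Assumption: the repo's BOARD_SIZE constant is the standard ConnectX grid.
BOARD_SIZE = (6, 7)

def per_step_reward(turn: int) -> float:
    # Mirrors the new line in connectx_env.py step():
    #   self.reward = self.turn / math.prod(BOARD_SIZE)
    # grows linearly with the move number, reaching 1.0 on a full board.
    return turn / math.prod(BOARD_SIZE)

def terminal_reward(game_result: float) -> float:
    # Mirrors GameResultReward._compute_rewards() when game_state.done:
    # the raw game result is scaled into the new reward_min/max range of +/-10.
    return game_result * 10.

if __name__ == "__main__":
    for turn in (1, 21, 42):
        print(turn, per_step_reward(turn))   # ~0.024, 0.5, 1.0
    print(terminal_reward(1.), terminal_reward(-1.))  # 10.0 -10.0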