diff --git a/conf/new_beginnings.yaml b/conf/new_beginnings.yaml
index 9dc16fb..ea9dc18 100644
--- a/conf/new_beginnings.yaml
+++ b/conf/new_beginnings.yaml
@@ -6,7 +6,7 @@ hydra:
   run:
     dir: ./outputs/${now:%m-%d}/${now:%H-%M-%S}
 
-name: new_beginnings
+name: winning_position_reward
 ## WANDB params
 # The wandb project name
 project: ConnectX
@@ -17,7 +17,7 @@ group: debug
 
 # Parameters to overwrite
 use_mixed_precision: False
-total_steps: 1e4
+total_steps: 1e5
 batch_size: 8
 checkpoint_freq: 60.
 num_actors: 2
@@ -34,16 +34,15 @@ device: cpu
 rescale_value_input: False
 obs_space_kwargs: {}
 reward_space_kwargs: {}
-debug: True
+debug: False
 
 # Environment params
-#adversary: random
 adversary: negamax
 
 # Model params
 act_space: BasicActionSpace
 obs_space: BasicObsSpace
-reward_space: LongGameReward
+reward_space: MoreInARowReward
 
 ## OPTIMIZER params
 optimizer_class: Adam
@@ -60,7 +59,7 @@ baseline_cost: 1.
 teacher_kl_cost: 0.
 # lambda parameter for TD-lambda and UPGO losses
 lmb: 0.8
-reduction: sum
+reduction: mean
 
 # MISCELLANEOUS params
 actor_device: cpu
diff --git a/connectx/connectx_gym/__init__.py b/connectx/connectx_gym/__init__.py
index b780d30..5c57e6d 100644
--- a/connectx/connectx_gym/__init__.py
+++ b/connectx/connectx_gym/__init__.py
@@ -40,7 +40,6 @@ def create_env(flags, device: torch.device, teacher_flags: Optional = None, seed
         envs.append(env)
     env = VecEnv(envs)
     env = PytorchEnv(env, device)
-    # env = TensorflowEnv(env, device)
     env = DictEnv(env)
 
     return env
diff --git a/connectx/connectx_gym/connectx_env.py b/connectx/connectx_gym/connectx_env.py
index 24ca25e..99d7997 100644
--- a/connectx/connectx_gym/connectx_env.py
+++ b/connectx/connectx_gym/connectx_env.py
@@ -3,7 +3,6 @@
 import gym
 import math
 import numpy as np
-from scipy.special import softmax
 
 from .act_spaces import BaseActSpace
 from .obs_spaces import BaseObsSpace
@@ -25,6 +24,7 @@ def __init__(
         super(ConnectFour, self).__init__()
         self.env = make("connectx", debug=True)
         self.player_id = player_id
+        self.mark = player_id + 1
         players = [adversary, adversary]
         players[player_id] = None
         self.trainer = self.env.train(players)
@@ -32,59 +32,37 @@
         self.rows = self.env.configuration.rows
         self.columns = self.env.configuration.columns
 
+        self.game_reward = 0.
         self.action_space = act_space
         self.obs_space = obs_space
         self.info = dict()
 
     def reset(self, **kwargs):
         obs = self.trainer.reset()
-        reward = 0.
+        self.game_reward = 0.
         done = False
-        self._update(obs, reward)
+        self._update(obs)
 
-        return obs, reward, done, self.info
+        return obs, self.game_reward, done, self.info
 
     def step(self, action):
-        obs, reward, done, _ = self.trainer.step(action)
-        self._update(obs, reward, action)
-        return obs, reward, done, self.info
-
-    # def process_actions(self, logits: np.ndarray) -> Tuple[List[List[str]], Dict[str, np.ndarray]]:
-    #     step = self.env.state[0]['observation']['step']
-    #     board = self.env.state[0]['observation']['board']
-    #     obs = np.array(board).reshape(BOARD_SIZE)
-    #     print(f"\naction logits:\n{logits}")
-    #     valid_action_logits = self.action_space.process_actions(
-    #         logits,
-    #         obs,
-    #     )
-    #     print(f"\nvalid actions:\n{valid_action_logits}")
-    #     valid_action_probs = softmax(valid_action_logits)
-    #     action = np.random.choice(BOARD_SIZE[1], p=valid_action_probs)
-    #
-    #     self.info.update(
-    #         dict(
-    #             logits=logits,
-    #             masked_logits=valid_action_logits,
-    #             masked_probs=valid_action_probs,
-    #             action=action,
-    #             step=step,
-    #         )
-    #     )
-    #     return action
-
-    def _update(self, obs, reward, action=-1):
+        obs, self.game_reward, done, _ = self.trainer.step(action)
+        self._update(obs, action)
+        return obs, self.game_reward, done, self.info
+
+    def _update(self, obs, action=-1):
         obs_array = np.array(obs['board']).reshape((1,*BOARD_SIZE))
         self.info = dict(
             action=action,
-            reward=reward,
+            reward=self.game_reward,
            available_actions_mask=self.action_space.get_available_actions_mask(obs_array),
         )
 
     def render(self, **kwargs):
         self.env.render(**kwargs)
 
+    @property
     def turn(self):
         return self.env.state[0]['observation']['step']
@@ -92,3 +70,11 @@ def turn(self):
     @property
     def done(self):
         return self.env.done
+
+    @property
+    def board(self):
+        return np.array(self.env.state[0]['observation']['board']).reshape(BOARD_SIZE)
+
+    @property
+    def configuration(self):
+        return self.env.configuration
\ No newline at end of file
diff --git a/connectx/connectx_gym/reward_spaces.py b/connectx/connectx_gym/reward_spaces.py
index 3d374ec..5988545 100644
--- a/connectx/connectx_gym/reward_spaces.py
+++ b/connectx/connectx_gym/reward_spaces.py
@@ -1,13 +1,13 @@
-from typing import NamedTuple, Tuple, Dict
 from abc import ABC, abstractmethod
+from kaggle_environments.core import Environment
 import logging
 import math
-
-from kaggle_environments.core import Environment
 import numpy as np
+from scipy.signal import convolve2d
+from typing import NamedTuple, Tuple, Dict
 
 from .connectx_env import ConnectFour
-from ..utility_constants import BOARD_SIZE
+from ..utility_constants import BOARD_SIZE, IN_A_ROW
 
 class RewardSpec(NamedTuple):
     reward_min: float
@@ -90,4 +90,41 @@ def compute_rewards(self, game_state: ConnectFour) -> Tuple[float, bool]:
         return self._compute_rewards(game_state), game_state.done
 
     def _compute_rewards(self, game_state: ConnectFour) -> float:
-        return game_state.turn / math.prod(BOARD_SIZE)
\ No newline at end of file
+        return game_state.turn / math.prod(BOARD_SIZE)
+
+
+class MoreInARowReward(BaseRewardSpace):
+    @staticmethod
+    def get_reward_spec() -> RewardSpec:
+        return RewardSpec(
+            reward_min=-1.,
+            reward_max=1.,
+            zero_sum=False,
+            only_once=False
+        )
+    def __init__(self, **kwargs):
+        super(MoreInARowReward, self).__init__(**kwargs)
+
+        horizontal_kernel = np.ones([1, IN_A_ROW], dtype=np.uint8)
+        vertical_kernel = np.transpose(horizontal_kernel)
+        diag1_kernel = np.eye(IN_A_ROW, dtype=np.uint8)
+        diag2_kernel = np.fliplr(diag1_kernel)
+
+        self.victory_kernels = [
+            horizontal_kernel,
+            vertical_kernel,
+            diag1_kernel,
+            diag2_kernel,
+        ]
+
+    def compute_rewards(self, game_state: ConnectFour) -> Tuple[float, bool]:
+        if game_state.done:
+            return game_state.game_reward, game_state.done
+        return self._compute_rewards(game_state), game_state.done
+
+    def _compute_rewards(self, game_state: ConnectFour) -> float:
+        for kernel in self.victory_kernels:
+            conv = convolve2d(game_state.board == game_state.mark, kernel, mode="valid")
+            if (conv==IN_A_ROW-1).any():
+                return .5
+        return -1/42
\ No newline at end of file
diff --git a/connectx/torchbeast/monobeast.py b/connectx/torchbeast/monobeast.py
index 5ca68f0..fc96488 100644
--- a/connectx/torchbeast/monobeast.py
+++ b/connectx/torchbeast/monobeast.py
@@ -451,39 +451,6 @@ def learn(
 
             assert len(last_lr) == 1, 'Logging per-parameter LR still needs support'
             last_lr = last_lr[0]
-            '''
-            action_distributions_flat = {
-                key[16:]: val[batch["done"]][~val[batch["done"]].isnan()].sum().item()
-                for key, val in batch["info"].items()
-                if key.startswith("LOGGING_") and "ACTIONS_" in key
-            }
-            action_distributions = {space: {} for space in ACTION_MEANINGS.keys()}
-            for flat_name, n in action_distributions_flat.items():
-                space, meaning = flat_name.split(".")
-                action_distributions[space][meaning] = n
-            action_distributions_aggregated = {}
-            for space, dist in action_distributions.items():
-                if space == "city_tile":
-                    action_distributions_aggregated[space] = dist
-                elif space in ("cart", "worker"):
-                    aggregated = {
-                        a: n for a, n in dist.items() if "TRANSFER" not in a and "MOVE" not in a
-                    }
-                    aggregated["TRANSFER"] = sum({a: n for a, n in dist.items() if "TRANSFER" in a}.values())
-                    aggregated["MOVE"] = sum({a: n for a, n in dist.items() if "MOVE" in a}.values())
-                    action_distributions_aggregated[space] = aggregated
-                else:
-                    raise RuntimeError(f"Unrecognized action_space: {space}")
-                n_actions = sum(action_distributions_aggregated[space].values())
-                if n_actions == 0:
-                    action_distributions_aggregated[space] = {
-                        key: float("nan") for key in action_distributions_aggregated[space].keys()
-                    }
-                else:
-                    action_distributions_aggregated[space] = {
-                        key: val / n_actions for key, val in action_distributions_aggregated[space].items()
-                    }
-            '''
 
             total_games_played += batch["done"].sum().item()
             stats = {
@@ -497,25 +464,17 @@
                     "vtrace_pg_loss": vtrace_pg_loss.detach().item(),
                     "upgo_pg_loss": upgo_pg_loss.detach().item(),
                     "baseline_loss": baseline_loss.detach().item(),
-                    "teacher_kl_loss": teacher_kl_loss.detach().item(),
-                    "teacher_baseline_loss": teacher_baseline_loss.detach().item(),
+                    # "teacher_kl_loss": teacher_kl_loss.detach().item(),
+                    # "teacher_baseline_loss": teacher_baseline_loss.detach().item(),
                     "entropy_loss": entropy_loss.detach().item(),
                     "total_loss": total_loss.detach().item(),
                 },
-                # "Entropy": {
-                #     "overall": sum(e for e in entropies.values() if not math.isnan(e)),
-                #     **entropies
-                # },
-                # "Teacher_KL_Divergence": {
-                #     "overall": sum(tkld for tkld in teacher_kl_losses.values() if not math.isnan(tkld)),
-                #     **teacher_kl_losses
-                # },
                 "Entropy": {
                     "overall": entropies,
                 },
-                "Teacher_KL_Divergence": {
-                    "overall": teacher_kl_losses,
-                },
+                # "Teacher_KL_Divergence": {
+                #     "overall": teacher_kl_losses,
+                # },
                 "Misc": {
                     "learning_rate": last_lr,
                     "total_games_played": total_games_played
diff --git a/connectx/utility_constants.py b/connectx/utility_constants.py
index a43c81f..f2af211 100644
--- a/connectx/utility_constants.py
+++ b/connectx/utility_constants.py
@@ -1 +1,2 @@
-BOARD_SIZE = (6,7)
\ No newline at end of file
+BOARD_SIZE = (6,7)
+IN_A_ROW = 4
\ No newline at end of file
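
Note on the new reward space, with a standalone sketch (not part of the diff): MoreInARowReward convolves a boolean "my pieces" mask with four direction kernels and returns 0.5 whenever any window of IN_A_ROW cells already contains IN_A_ROW - 1 of the agent's marks (whether or not the fourth cell is still open), and -1/42 per move otherwise; once game_state.done is True it falls back to the kaggle-environments game reward. The snippet below only illustrates that convolution check outside the training loop. The toy board, variable names, and the explicit uint8 cast are assumptions for the example, not code from this change; BOARD_SIZE and IN_A_ROW mirror connectx/utility_constants.py.

```python
# Minimal sketch (not from the diff): reproduce the "IN_A_ROW - 1 in a window"
# detection that MoreInARowReward relies on, using scipy.signal.convolve2d.
import numpy as np
from scipy.signal import convolve2d

BOARD_SIZE = (6, 7)  # mirrors connectx/utility_constants.py
IN_A_ROW = 4

horizontal = np.ones((1, IN_A_ROW), dtype=np.uint8)
kernels = [
    horizontal,                                   # horizontal windows
    horizontal.T,                                 # vertical windows
    np.eye(IN_A_ROW, dtype=np.uint8),             # one diagonal
    np.fliplr(np.eye(IN_A_ROW, dtype=np.uint8)),  # the other diagonal
]

# Hypothetical position: mark 1 has three adjacent pieces on the bottom row.
board = np.zeros(BOARD_SIZE, dtype=np.uint8)
board[5, 2:5] = 1

mark = 1
near_win = any(
    (convolve2d((board == mark).astype(np.uint8), k, mode="valid") == IN_A_ROW - 1).any()
    for k in kernels
)
print(near_win)  # True -> the shaping reward for this non-terminal position would be 0.5
```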