Commit: Reward based on winning position

FilipinoGambino committed Feb 2, 2024
1 parent 348b9c2 commit fa0e273
Showing 6 changed files with 74 additions and 93 deletions.
11 changes: 5 additions & 6 deletions conf/new_beginnings.yaml
@@ -6,7 +6,7 @@ hydra:
run:
dir: ./outputs/${now:%m-%d}/${now:%H-%M-%S}

name: new_beginnings
name: winning_position_reward
## WANDB params
# The wandb project name
project: ConnectX
@@ -17,7 +17,7 @@ group: debug

# Parameters to overwrite
use_mixed_precision: False
total_steps: 1e4
total_steps: 1e5
batch_size: 8
checkpoint_freq: 60.
num_actors: 2
@@ -34,16 +34,15 @@ device: cpu
rescale_value_input: False
obs_space_kwargs: {}
reward_space_kwargs: {}
debug: True
debug: False

# Environment params
#adversary: random
adversary: negamax

# Model params
act_space: BasicActionSpace
obs_space: BasicObsSpace
reward_space: LongGameReward
reward_space: MoreInARowReward

## OPTIMIZER params
optimizer_class: Adam
@@ -60,7 +59,7 @@ baseline_cost: 1.
teacher_kl_cost: 0.
# lambda parameter for TD-lambda and UPGO losses
lmb: 0.8
reduction: sum
reduction: mean

# MISCELLANEOUS params
actor_device: cpu
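Besides the longer run (total_steps 1e5, debug off) and the switch from LongGameReward to the new MoreInARowReward, the config changes the loss reduction from sum to mean. A quick, generic illustration of why that matters (plain torch losses, not the repo's own learner code): with sum the loss and its gradients scale with the number of elements, with mean they do not, so the effective step size changes.

import torch
import torch.nn.functional as F

pred = torch.randn(8, 1)
target = torch.zeros(8, 1)

# With reduction="sum" the loss grows with the number of elements; with "mean" it does not.
print(F.mse_loss(pred, target, reduction="sum").item())   # 8x the mean value for these 8 elements
print(F.mse_loss(pred, target, reduction="mean").item())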
1 change: 0 additions & 1 deletion connectx/connectx_gym/__init__.py
@@ -40,7 +40,6 @@ def create_env(flags, device: torch.device, teacher_flags: Optional = None, seed
envs.append(env)
env = VecEnv(envs)
env = PytorchEnv(env, device)
# env = TensorflowEnv(env, device)
env = DictEnv(env)
return env

54 changes: 20 additions & 34 deletions connectx/connectx_gym/connectx_env.py
@@ -3,7 +3,6 @@
import gym
import math
import numpy as np
from scipy.special import softmax

from .act_spaces import BaseActSpace
from .obs_spaces import BaseObsSpace
@@ -25,70 +24,57 @@ def __init__(
super(ConnectFour, self).__init__()
self.env = make("connectx", debug=True)
self.player_id = player_id
self.mark = player_id + 1
players = [adversary, adversary]
players[player_id] = None
self.trainer = self.env.train(players)

self.rows = self.env.configuration.rows
self.columns = self.env.configuration.columns

self.game_reward = 0.
self.action_space = act_space
self.obs_space = obs_space
self.info = dict()

def reset(self, **kwargs):
obs = self.trainer.reset()
reward = 0.
self.game_reward = 0.
done = False
self._update(obs, reward)
self._update(obs)

return obs, reward, done, self.info
return obs, self.game_reward, done, self.info

def step(self, action):
obs, reward, done, _ = self.trainer.step(action)
self._update(obs, reward, action)
return obs, reward, done, self.info

# def process_actions(self, logits: np.ndarray) -> Tuple[List[List[str]], Dict[str, np.ndarray]]:
# step = self.env.state[0]['observation']['step']
# board = self.env.state[0]['observation']['board']
# obs = np.array(board).reshape(BOARD_SIZE)
# print(f"\naction logits:\n{logits}")
# valid_action_logits = self.action_space.process_actions(
# logits,
# obs,
# )
# print(f"\nvalid actions:\n{valid_action_logits}")
# valid_action_probs = softmax(valid_action_logits)
# action = np.random.choice(BOARD_SIZE[1], p=valid_action_probs)
#
# self.info.update(
# dict(
# logits=logits,
# masked_logits=valid_action_logits,
# masked_probs=valid_action_probs,
# action=action,
# step=step,
# )
# )
# return action

def _update(self, obs, reward, action=-1):
obs, self.game_reward, done, _ = self.trainer.step(action)
self._update(obs, action)
return obs, self.game_reward, done, self.info

def _update(self, obs, action=-1):
obs_array = np.array(obs['board']).reshape((1,*BOARD_SIZE))

self.info = dict(
action=action,
reward=reward,
reward=self.game_reward,
available_actions_mask=self.action_space.get_available_actions_mask(obs_array),
)

def render(self, **kwargs):
self.env.render(**kwargs)


@property
def turn(self):
return self.env.state[0]['observation']['step']

@property
def done(self):
return self.env.done

@property
def board(self):
return np.array(self.env.state[0]['observation']['board']).reshape(BOARD_SIZE)

@property
def configuration(self):
return self.env.configuration
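The rewrite keeps the running reward on the wrapper itself (self.game_reward), drops the reward argument from _update, and exposes mark, done, board, and configuration as properties so reward spaces can inspect the live game. A rough usage sketch follows; it is not part of the commit. The act/obs space class names come from the YAML config, but their module locations and no-arg constructors, the ConnectFour keyword names (inferred from the body above), and the layout of available_actions_mask are all assumptions.

import numpy as np

from connectx.connectx_gym.connectx_env import ConnectFour
from connectx.connectx_gym.act_spaces import BasicActionSpace  # assumed location and no-arg constructor
from connectx.connectx_gym.obs_spaces import BasicObsSpace     # assumed location and no-arg constructor

env = ConnectFour(
    act_space=BasicActionSpace(),
    obs_space=BasicObsSpace(),
    player_id=0,
    adversary="negamax",  # matches the new config default
)
obs, reward, done, info = env.reset()
while not env.done:
    # Assumes nonzero mask entries flag playable columns.
    playable = np.flatnonzero(info["available_actions_mask"])
    obs, reward, done, info = env.step(int(playable[0]))
print(env.turn, env.game_reward)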
47 changes: 42 additions & 5 deletions connectx/connectx_gym/reward_spaces.py
@@ -1,13 +1,13 @@
from typing import NamedTuple, Tuple, Dict
from abc import ABC, abstractmethod
from kaggle_environments.core import Environment
import logging
import math

from kaggle_environments.core import Environment
import numpy as np
from scipy.signal import convolve2d
from typing import NamedTuple, Tuple, Dict

from .connectx_env import ConnectFour
from ..utility_constants import BOARD_SIZE
from ..utility_constants import BOARD_SIZE, IN_A_ROW

class RewardSpec(NamedTuple):
reward_min: float
@@ -90,4 +90,41 @@ def compute_rewards(self, game_state: ConnectFour) -> Tuple[float, bool]:
return self._compute_rewards(game_state), game_state.done

def _compute_rewards(self, game_state: ConnectFour) -> float:
return game_state.turn / math.prod(BOARD_SIZE)
return game_state.turn / math.prod(BOARD_SIZE)


class MoreInARowReward(BaseRewardSpace):
@staticmethod
def get_reward_spec() -> RewardSpec:
return RewardSpec(
reward_min=-1.,
reward_max=1.,
zero_sum=False,
only_once=False
)
def __init__(self, **kwargs):
super(MoreInARowReward, self).__init__(**kwargs)

horizontal_kernel = np.ones([1, IN_A_ROW], dtype=np.uint8)
vertical_kernel = np.transpose(horizontal_kernel)
diag1_kernel = np.eye(IN_A_ROW, dtype=np.uint8)
diag2_kernel = np.fliplr(diag1_kernel)

self.victory_kernels = [
horizontal_kernel,
vertical_kernel,
diag1_kernel,
diag2_kernel,
]

def compute_rewards(self, game_state: ConnectFour) -> Tuple[float, bool]:
if game_state.done:
return game_state.game_reward, game_state.done
return self._compute_rewards(game_state), game_state.done

def _compute_rewards(self, game_state: ConnectFour) -> float:
for kernel in self.victory_kernels:
conv = convolve2d(game_state.board == game_state.mark, kernel, mode="valid")
if (conv==IN_A_ROW-1).any():
return .5
return -1/42
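MoreInARowReward returns the terminal game_reward once the episode is over; otherwise it gives +0.5 whenever any length-4 window (horizontal, vertical, or either diagonal) contains exactly IN_A_ROW - 1 of the agent's marks, and a small -1/42 penalty per move if none does. A standalone check of the kernel trick, using a hand-made board that is not from the repo:

import numpy as np
from scipy.signal import convolve2d

IN_A_ROW = 4
BOARD_SIZE = (6, 7)

board = np.zeros(BOARD_SIZE, dtype=np.uint8)
board[5, 2:5] = 1   # three of "our" marks on the bottom row
board[5, 5] = 2     # one opponent mark next to them

horizontal_kernel = np.ones([1, IN_A_ROW], dtype=np.uint8)
conv = convolve2d(board == 1, horizontal_kernel, mode="valid")

# Any window holding exactly IN_A_ROW - 1 of our marks triggers the +0.5 reward.
print((conv == IN_A_ROW - 1).any())  # True for this position

Note that the check only counts marks inside each window; it does not verify that the fourth cell of the window is empty or currently playable.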
51 changes: 5 additions & 46 deletions connectx/torchbeast/monobeast.py
@@ -451,39 +451,6 @@ def learn(
assert len(last_lr) == 1, 'Logging per-parameter LR still needs support'
last_lr = last_lr[0]

'''
action_distributions_flat = {
key[16:]: val[batch["done"]][~val[batch["done"]].isnan()].sum().item()
for key, val in batch["info"].items()
if key.startswith("LOGGING_") and "ACTIONS_" in key
}
action_distributions = {space: {} for space in ACTION_MEANINGS.keys()}
for flat_name, n in action_distributions_flat.items():
space, meaning = flat_name.split(".")
action_distributions[space][meaning] = n
action_distributions_aggregated = {}
for space, dist in action_distributions.items():
if space == "city_tile":
action_distributions_aggregated[space] = dist
elif space in ("cart", "worker"):
aggregated = {
a: n for a, n in dist.items() if "TRANSFER" not in a and "MOVE" not in a
}
aggregated["TRANSFER"] = sum({a: n for a, n in dist.items() if "TRANSFER" in a}.values())
aggregated["MOVE"] = sum({a: n for a, n in dist.items() if "MOVE" in a}.values())
action_distributions_aggregated[space] = aggregated
else:
raise RuntimeError(f"Unrecognized action_space: {space}")
n_actions = sum(action_distributions_aggregated[space].values())
if n_actions == 0:
action_distributions_aggregated[space] = {
key: float("nan") for key in action_distributions_aggregated[space].keys()
}
else:
action_distributions_aggregated[space] = {
key: val / n_actions for key, val in action_distributions_aggregated[space].items()
}
'''
total_games_played += batch["done"].sum().item()

stats = {
@@ -497,25 +464,17 @@
"vtrace_pg_loss": vtrace_pg_loss.detach().item(),
"upgo_pg_loss": upgo_pg_loss.detach().item(),
"baseline_loss": baseline_loss.detach().item(),
"teacher_kl_loss": teacher_kl_loss.detach().item(),
"teacher_baseline_loss": teacher_baseline_loss.detach().item(),
# "teacher_kl_loss": teacher_kl_loss.detach().item(),
# "teacher_baseline_loss": teacher_baseline_loss.detach().item(),
"entropy_loss": entropy_loss.detach().item(),
"total_loss": total_loss.detach().item(),
},
# "Entropy": {
# "overall": sum(e for e in entropies.values() if not math.isnan(e)),
# **entropies
# },
# "Teacher_KL_Divergence": {
# "overall": sum(tkld for tkld in teacher_kl_losses.values() if not math.isnan(tkld)),
# **teacher_kl_losses
# },
"Entropy": {
"overall": entropies,
},
"Teacher_KL_Divergence": {
"overall": teacher_kl_losses,
},
# "Teacher_KL_Divergence": {
# "overall": teacher_kl_losses,
# },
"Misc": {
"learning_rate": last_lr,
"total_games_played": total_games_played
3 changes: 2 additions & 1 deletion connectx/utility_constants.py
@@ -1 +1,2 @@
BOARD_SIZE = (6,7)
BOARD_SIZE = (6,7)
IN_A_ROW = 4
