Commit b9ad471

committed May 30, 2021
merged experimental into dev/jan
2 parents 6fe8ace + 3a17423

27 files changed: +1417 −424 lines
 

‎README.md

+7 −7

@@ -133,12 +133,12 @@ To plot results from experiments, run `jupyter-notebook` and open [`plot_experim
 If you use MDP Playground in your work, please cite the following paper:

 ```bibtex
-@article{rajan2019mdp,
-    title={MDP Playground: Meta-Features in Reinforcement Learning},
-    author={Raghu Rajan and Frank Hutter},
-    year={2019},
-    eprint={1909.07750},
-    archivePrefix={arXiv},
-    primaryClass={cs.LG}
+@article{rajan2020mdp,
+    title={MDP Playground: Controlling Dimensions of Hardness in Reinforcement Learning},
+    author={Raghu Rajan and Jessica Lizeth Borja Diaz and Suresh Guttikonda and Fabio Ferreira and André Biedenkapp and Frank Hutter},
+    year={2020},
+    eprint={1909.07750},
+    archivePrefix={arXiv},
+    primaryClass={cs.LG}
 }
 ```

‎default_config.py

−5

@@ -76,8 +76,3 @@
         "lstm_use_prev_action_reward": False,
     },
 }
-
-varying_configs = get_grid_of_configs(var_configs)
-# print("VARYING_CONFIGS:", varying_configs)
-
-final_configs = combined_processing(env_config, agent_config, model_config, eval_config, varying_configs=varying_configs, framework='ray', algorithm='SAC')
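The grid expansion and combined processing removed here presumably now happen inside mdp_playground's config processing when run_experiments.py loads a config. A minimal sketch of the assumed grid expansion (the names get_grid_of_configs and var_configs are taken from the removed lines; the real implementation may differ):

```python
import itertools
from collections import OrderedDict

def get_grid_of_configs_sketch(var_configs):
    # Cartesian product over the leaf lists of a {"env": {...}, "agent": {...}} dict,
    # mirroring the assertion used in the experiment config files of this commit.
    value_tuples = []
    for config_type, config_dict in var_configs.items():
        for key in config_dict:
            assert isinstance(config_dict[key], list), \
                "leaf values should be lists of the values each option can take"
            value_tuples.append(config_dict[key])
    return list(itertools.product(*value_tuples))

var_configs = OrderedDict({
    "env": OrderedDict({"delay": [0, 1], "sequence_length": [1, 2, 3]}),
    "agent": OrderedDict({"lr": [1e-2, 1e-4]}),
})
print(len(get_grid_of_configs_sketch(var_configs)))  # 2 * 3 * 2 = 12 configs
```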

‎docs/_autosummary/mdp_playground.envs.mujoco_env_wrapper.rst

Whitespace-only changes.

‎docs/_autosummary/mdp_playground.spaces.test_image_multi_discrete.rst

Whitespace-only changes.

‎example.py

+2 −2

@@ -25,7 +25,7 @@ def discrete_environment_example():
     config["seed"] = 0

     config["state_space_type"] = "discrete"
-    config["state_space_size"] = 8
+    config["action_space_size"] = 8
     config["delay"] = 1
     config["sequence_length"] = 3
     config["reward_scale"] = 2.5
@@ -59,7 +59,7 @@ def discrete_environment_image_representations_example():
     config["seed"] = 0

     config["state_space_type"] = "discrete"
-    config["state_space_size"] = 8
+    config["action_space_size"] = 8
     config["image_representations"] = True
     config["delay"] = 1
     config["sequence_length"] = 3
+97 (new file)

@@ -0,0 +1,97 @@
timesteps_total = 20_000
num_seeds = 10
from collections import OrderedDict
var_env_configs = OrderedDict({
    'state_space_size': [8],#, 10, 12, 14] # [2**i for i in range(1,6)]
    'action_space_size': [8],#2, 4, 8, 16] # [2**i for i in range(1,6)]
    'delay': [0],
    'sequence_length': [1],#i for i in range(1,4)]
    'reward_density': [0.25], # np.linspace(0.0, 1.0, num=5)
    'make_denser': [False],
    'terminal_state_density': [0.25], # np.linspace(0.1, 1.0, num=5)
    'transition_noise': [0, 0.01, 0.02, 0.10, 0.25],
    'reward_noise': [0], # Std dev. of normal dist.
    'dummy_seed': [i for i in range(num_seeds)],
})

import numpy as np
var_agent_configs = OrderedDict({
    "lr": list(np.power(10.,np.linspace(-1, -6, 16))), # "lr": grid_search([1e-2, 1e-4, 1e-6]),
})


var_configs = OrderedDict({
    "env": var_env_configs,
    "agent": var_agent_configs,
})

env_config = {
    "env": "RLToy-v0",
    "horizon": 100,
    "env_config": {
        'seed': 0, #seed
        'state_space_type': 'discrete',
        'action_space_type': 'discrete',
        'generate_random_mdp': True,
        'repeats_in_sequences': False,
        'reward_scale': 1.0,
        'completely_connected': True,
    },
}

algorithm = "DQN"
agent_config = {
    "adam_epsilon": 1e-4,
    "beta_annealing_fraction": 1.0,
    "buffer_size": 20_000,
    "double_q": False,
    "dueling": False,
    "exploration_final_eps": 0.01,
    "exploration_fraction": 0.1,
    "final_prioritized_replay_beta": 1.0,
    "hiddens": None,
    "learning_starts": 1000,
    # "lr": 1e-4, # "lr": grid_search([1e-2, 1e-4, 1e-6]),
    "n_step": 1,
    "noisy": False,
    "num_atoms": 1,
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.5,
    "sample_batch_size": 4,
    "schedule_max_timesteps": 20000,
    "target_network_update_freq": 800,
    "timesteps_per_iteration": 1000,
    "min_iter_time_s": 0,
    "train_batch_size": 32,
}

model_config = {
    "model": {
        "fcnet_hiddens": [256, 256],
        "custom_preprocessor": "ohe",
        "custom_options": {}, # extra options to pass to your preprocessor
        "fcnet_activation": "tanh",
        "use_lstm": False,
        "max_seq_len": 20,
        "lstm_cell_size": 256,
        "lstm_use_prev_action_reward": False,
    },
}

from ray import tune
eval_config = {
    "evaluation_interval": 1, # I think this means every x training_iterations
    "evaluation_config": {
        "explore": False,
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
        "evaluation_num_episodes": 10,
        "horizon": 100,
        "env_config": {
            "dummy_eval": True, #hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
            'transition_noise': 0 if "state_space_type" in env_config["env_config"] and env_config["env_config"]["state_space_type"] == "discrete" else tune.function(lambda a: a.normal(0, 0)),
            'reward_noise': tune.function(lambda a: a.normal(0, 0)),
            'action_loss_weight': 0.0,
        }
    },
}
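A back-of-the-envelope check (not part of the repo) of how many runs the varied dimensions above expand to once their Cartesian product is taken:

```python
import numpy as np

transition_noise = [0, 0.01, 0.02, 0.10, 0.25]
dummy_seed = list(range(10))                       # num_seeds = 10
lr = list(np.power(10., np.linspace(-1, -6, 16)))
# All other env dimensions have a single value, so they contribute a factor of 1.
print(len(transition_noise) * len(dummy_seed) * len(lr))  # 5 * 10 * 16 = 800 configs
```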
+98
@@ -0,0 +1,98 @@
1+
timesteps_total = 20_000
2+
num_seeds = 10
3+
from collections import OrderedDict
4+
var_env_configs = OrderedDict({
5+
'state_space_size': [8],#, 10, 12, 14] # [2**i for i in range(1,6)]
6+
'action_space_size': [8],#2, 4, 8, 16] # [2**i for i in range(1,6)]
7+
'delay': [0],
8+
'sequence_length': [1],#i for i in range(1,4)]
9+
'reward_density': [0.25], # np.linspace(0.0, 1.0, num=5)
10+
'make_denser': [False],
11+
'terminal_state_density': [0.25], # np.linspace(0.1, 1.0, num=5)
12+
'transition_noise': [0],
13+
'reward_noise': [0, 1, 5, 10, 25], # Std dev. of normal dist.
14+
'dummy_seed': [i for i in range(num_seeds)],
15+
})
16+
17+
import numpy as np
18+
var_agent_configs = OrderedDict({
19+
"lr": [1e-4] #list(np.power(10.,np.linspace(-1, -6, 16))), # "lr": grid_search([1e-2, 1e-4, 1e-6]),
20+
})
21+
22+
23+
var_configs = OrderedDict({
24+
"env": var_env_configs,
25+
"agent": var_agent_configs,
26+
})
27+
28+
env_config = {
29+
"env": "RLToy-v0",
30+
"horizon": 100,
31+
"env_config": {
32+
'seed': 0, #seed
33+
'state_space_type': 'discrete',
34+
'action_space_type': 'discrete',
35+
'generate_random_mdp': True,
36+
'repeats_in_sequences': False,
37+
'reward_scale': 1.0,
38+
'completely_connected': True,
39+
},
40+
}
41+
42+
algorithm = "DQN"
43+
agent_config = {
44+
"adam_epsilon": 1e-4,
45+
"beta_annealing_fraction": 1.0,
46+
"buffer_size": 20_000,
47+
'clip_rewards': False,
48+
"double_q": False,
49+
"dueling": False,
50+
"exploration_final_eps": 0.01,
51+
"exploration_fraction": 0.1,
52+
"final_prioritized_replay_beta": 1.0,
53+
"hiddens": None,
54+
"learning_starts": 1000,
55+
# "lr": 1e-4, # "lr": grid_search([1e-2, 1e-4, 1e-6]),
56+
"n_step": 1,
57+
"noisy": False,
58+
"num_atoms": 1,
59+
"prioritized_replay": False,
60+
"prioritized_replay_alpha": 0.5,
61+
"sample_batch_size": 4,
62+
"schedule_max_timesteps": 20000,
63+
"target_network_update_freq": 800,
64+
"timesteps_per_iteration": 1000,
65+
"min_iter_time_s": 0,
66+
"train_batch_size": 32,
67+
}
68+
69+
model_config = {
70+
"model": {
71+
"fcnet_hiddens": [256, 256],
72+
"custom_preprocessor": "ohe",
73+
"custom_options": {}, # extra options to pass to your preprocessor
74+
"fcnet_activation": "tanh",
75+
"use_lstm": False,
76+
"max_seq_len": 20,
77+
"lstm_cell_size": 256,
78+
"lstm_use_prev_action_reward": False,
79+
},
80+
}
81+
82+
from ray import tune
83+
eval_config = {
84+
"evaluation_interval": 1, # I think this means every x training_iterations
85+
"evaluation_config": {
86+
"explore": False,
87+
"exploration_fraction": 0,
88+
"exploration_final_eps": 0,
89+
"evaluation_num_episodes": 10,
90+
"horizon": 100,
91+
"env_config": {
92+
"dummy_eval": True, #hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
93+
'transition_noise': 0 if "state_space_type" in env_config["env_config"] and env_config["env_config"]["state_space_type"] == "discrete" else tune.function(lambda a: a.normal(0, 0)),
94+
'reward_noise': tune.function(lambda a: a.normal(0, 0)),
95+
'action_loss_weight': 0.0,
96+
}
97+
},
98+
}
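The eval_config blocks in the experiment files above disable noise during evaluation by passing zero-std callables via tune.function. A small standalone illustration of that convention (the callable receives the env's np_random and returns a noise sample; the std values here are made up):

```python
import numpy as np

rng = np.random.RandomState(0)
train_reward_noise = lambda a: a.normal(0, 5)   # e.g. one of the stds swept above
eval_reward_noise = lambda a: a.normal(0, 0)    # the eval override: always returns 0.0
print(train_reward_noise(rng), eval_reward_noise(rng))
```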

‎experiments/dqn_space_invaders_r_noise.py

+3 −3

@@ -3,7 +3,7 @@
 from collections import OrderedDict
 import numpy as np
 var_env_configs = OrderedDict({
-    'reward_noise': list(np.array([0, 1, 5, 10, 25])/100), # Std dev. of normal dist.
+    'reward_noise': list(np.array([0, 1, 5, 10, 25, 50, 100, 200, 400, 800])/100), # Std dev. of normal dist.
     'dummy_seed': [i for i in range(num_seeds)],
 })

@@ -35,15 +35,15 @@
 agent_config = { # Taken from Ray tuned_examples
     'adam_epsilon': 0.00015,
     'buffer_size': 500000,
-    'clip_rewards': True,
+    'clip_rewards': False,
     'double_q': False,
     'dueling': False,
     'exploration_config': { 'epsilon_timesteps': 200000,
                             'final_epsilon': 0.01},
     'final_prioritized_replay_beta': 1.0,
     'hiddens': [512],
     'learning_starts': 20000,
-    'lr': 6.25e-05,
+    # 'lr': 6.25e-05,
     'n_step': 1,
     'noisy': False,
     'num_atoms': 1,
+123 (new file)
@@ -0,0 +1,123 @@
1+
num_seeds = 5
2+
timesteps_total = 10_000_000
3+
from collections import OrderedDict
4+
import numpy as np
5+
var_env_configs = OrderedDict({
6+
'reward_noise': list(np.array([0])/100), # Std dev. of normal dist.
7+
'dummy_seed': [i for i in range(num_seeds)],
8+
})
9+
10+
import numpy as np
11+
var_agent_configs = OrderedDict({
12+
"lr": list(np.power(10.,np.linspace(-1, -6, 11))), # "lr": grid_search([1e-2, 1e-4, 1e-6]),
13+
})
14+
15+
var_configs = OrderedDict({
16+
"env": var_env_configs,
17+
"agent": var_agent_configs,
18+
})
19+
20+
env_config = {
21+
"env": "GymEnvWrapper-Atari",
22+
"env_config": {
23+
"AtariEnv": {
24+
"game": "space_invaders",
25+
'obs_type': 'image',
26+
'frameskip': 1,
27+
},
28+
# "GymEnvWrapper": {
29+
"atari_preprocessing": True,
30+
'frame_skip': 4,
31+
'grayscale_obs': False,
32+
'state_space_type': 'discrete',
33+
'action_space_type': 'discrete',
34+
'seed': 0,
35+
# },
36+
# 'seed': 0, #seed
37+
},
38+
}
39+
40+
algorithm = "DQN"
41+
agent_config = { # Taken from Ray tuned_examples
42+
'adam_epsilon': 0.00015,
43+
'buffer_size': 500000,
44+
'clip_rewards': False,
45+
'double_q': False,
46+
'dueling': False,
47+
'exploration_config': { 'epsilon_timesteps': 200000,
48+
'final_epsilon': 0.01},
49+
'final_prioritized_replay_beta': 1.0,
50+
'hiddens': [512],
51+
'learning_starts': 20000,
52+
# 'lr': 6.25e-05,
53+
'n_step': 1,
54+
'noisy': False,
55+
'num_atoms': 1,
56+
'num_gpus': 0,
57+
'num_workers': 3,
58+
'prioritized_replay': False,
59+
'prioritized_replay_alpha': 0.5,
60+
'prioritized_replay_beta_annealing_timesteps': 2000000,
61+
'rollout_fragment_length': 4,
62+
'target_network_update_freq': 8000,
63+
'timesteps_per_iteration': 10000,
64+
'train_batch_size': 32,
65+
"tf_session_args": {
66+
# note: overriden by `local_tf_session_args`
67+
"intra_op_parallelism_threads": 4,
68+
"inter_op_parallelism_threads": 4,
69+
# "gpu_options": {
70+
# "allow_growth": True,
71+
# },
72+
# "log_device_placement": False,
73+
"device_count": {
74+
"CPU": 2
75+
},
76+
# "allow_soft_placement": True, # required by PPO multi-gpu
77+
},
78+
# Override the following tf session args on the local worker
79+
"local_tf_session_args": {
80+
"intra_op_parallelism_threads": 4,
81+
"inter_op_parallelism_threads": 4,
82+
},
83+
84+
}
85+
86+
87+
model_config = {
88+
# "model": {
89+
# "fcnet_hiddens": [256, 256],
90+
# "fcnet_activation": "tanh",
91+
# "use_lstm": False,
92+
# "max_seq_len": 20,
93+
# "lstm_cell_size": 256,
94+
# "lstm_use_prev_action_reward": False,
95+
# },
96+
}
97+
98+
from ray import tune
99+
eval_config = {
100+
"evaluation_interval": None, # I think this means every x training_iterations
101+
"evaluation_config": {
102+
"explore": False,
103+
"exploration_fraction": 0,
104+
"exploration_final_eps": 0,
105+
"evaluation_num_episodes": 10,
106+
# "horizon": 100,
107+
"env_config": {
108+
"dummy_eval": True, #hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
109+
'transition_noise': 0 if "state_space_type" in env_config["env_config"] and env_config["env_config"]["state_space_type"] == "discrete" else tune.function(lambda a: a.normal(0, 0)),
110+
'reward_noise': tune.function(lambda a: a.normal(0, 0)),
111+
'action_loss_weight': 0.0,
112+
}
113+
},
114+
}
115+
value_tuples = []
116+
for config_type, config_dict in var_configs.items():
117+
for key in config_dict:
118+
assert type(var_configs[config_type][key]) == list, "var_config should be a dict of dicts with lists as the leaf values to allow each configuration option to take multiple possible values"
119+
value_tuples.append(var_configs[config_type][key])
120+
121+
import itertools
122+
cartesian_product_configs = list(itertools.product(*value_tuples))
123+
print("Total number of configs. to run:", len(cartesian_product_configs))

‎experiments/dqn_test_expt.py

−3

@@ -1,6 +1,3 @@
-from mdp_playground.config_processor import *
-
-# framework = 'ray'
 timesteps_total = 10_000
 num_seeds = 10
 from collections import OrderedDict

‎experiments/rainbow_hydra.py

+15 −129

@@ -1,12 +1,10 @@
 num_seeds = 1
 timesteps_total = 20_000
-num_agent_configs = 1000
-num_prob_inst = 1000
+num_configs = 1000

-import numpy as np
 from collections import OrderedDict

-var_env_configs = OrderedDict({
+sobol_env_configs = OrderedDict({
     'action_space_size': (8,),#, 10, 12, 14] # [2**i for i in range(1,6)]
     # 'action_space_size': (64),#2, 4, 8, 16] # [2**i for i in range(1,6)]
     'delay': "cat, " + str([i for i in range(11)]), # + [2**i for i in range(4)],
@@ -17,84 +15,13 @@
     'terminal_state_density': (0.25,), # np.linspace(0.1, 1.0, num=5)
     'reward_dist': "float, [0.01, 0.8]",
     'reward_scale': "float, log, [0.1, 100]",
-    'dummy_seed': (0,), #"cat, " + str([i for i in range(num_seeds)]),
+    'dummy_seed': (0,), # "cat, " + str([i for i in range(num_seeds)]), #seed
 })


-print(var_env_configs)
-cartesian_product_configs = []
-def sobol_configs_from_config_dict(config_dict):
-    '''
-    '''
+print(sobol_env_configs)

-    num_dims = 0
-    for key in config_dict:
-        val = config_dict[key]
-        if type(val) == tuple: # i.e. a constant value
-            pass
-        else: # i.e. a variable value
-            num_dims += 1
-
-    print("Generating sobol sequence with " + str(num_prob_inst) + " and " + str(num_dims) + " dimensions:")
-
-    from scipy.optimize._shgo_lib.sobol_seq import Sobol # Only generates real vectors in range 0 to 1 per dimension
-    import json
-    sobol_gen = Sobol()
-    sobol = sobol_gen.i4_sobol_generate(num_dims, num_prob_inst, skip=0)
-    print(sobol)
-
-    for sample in sobol:
-        # print(sample)
-        cartesian_product_configs.append({}) # new config
-        j = 0
-        for key in config_dict:
-            val = config_dict[key]
-            if type(val) == tuple: # i.e. a constant value
-                cartesian_product_configs[-1][key] = val[0]
-            # The rest are config spaces for param settings
-            elif "int" in val:
-                lower = float(val.split("[")[1].split(",")[0].strip())
-                upper = float(val.split("]")[0].split(",")[-1].strip())
-                log = True if "log" in val else False
-                #TODO log vals
-                sobol_val = lower + (upper - lower) * sample[j]
-                cartesian_product_configs[-1][key] = int(sobol_val)
-                j += 1
-            elif "float" in val:
-                lower = float(val.split("[")[1].split(",")[0].strip())
-                upper = float(val.split("]")[0].split(",")[-1].strip())
-                log = True if "log" in val else False
-                if log:
-                    lower = np.log(lower)
-                    upper = np.log(upper)
-                sobol_val = lower + (upper - lower) * sample[j]
-                if log:
-                    sobol_val = np.exp(sobol_val)
-                if key == "reward_dist":
-                    sobol_val = [sobol_val, 1.0]
-                cartesian_product_configs[-1][key] = sobol_val
-                j += 1
-            elif "cat" in val:
-                choices = json.loads("[" + val.split("[")[1].split("]")[0] + "]") # Seems faster than ast.literal_eval (See https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list)
-                len_c = len(choices)
-                if sample[j] == 1.0: #TODO remove? Don't know if sobol samples include 1.0
-                    sample[j] -= 1e-10
-                index = int(sample[j] * len_c)
-                cartesian_product_configs[-1][key] = choices[index]
-                j += 1
-
-
-
-sobol_configs_from_config_dict(var_env_configs)
-# import pprint
-# pp = pprint.PrettyPrinter(indent=4)
-
-for i, conf in enumerate(cartesian_product_configs):
-    cartesian_product_configs[i] = tuple(conf.values()) #hack
-    # print(conf)
-    # pp.pprint(cartesian_product_configs[i])
-
-var_agent_configs = OrderedDict({
+random_agent_configs = OrderedDict({

     "lr": "float, log, [1e-5, 1e-3]", # 1e-4
     "learning_starts": "int, [1, 2000]", # 500
@@ -107,53 +34,22 @@ def sobol_configs_from_config_dict(config_dict):

 })

-var_agent_configs = OrderedDict(sorted(var_agent_configs.items(), key=lambda t: t[0])) #hack because ConfigSpace below orders alphabetically, the returned configs are in a jumbled order compared to the order above.
+random_agent_configs = OrderedDict(sorted(random_agent_configs.items(), key=lambda t: t[0])) #hack because ConfigSpace below orders alphabetically, the returned configs are in a jumbled order compared to the order above, which would create problems with config processing.

-def create_config_space_from_config_dict(config_dict):
-    '''
-    '''
-    import ConfigSpace as CS
-    cs = CS.ConfigurationSpace(seed=1234)
-    import ConfigSpace.hyperparameters as CSH
-    import json

-    for key in config_dict:
-        val = config_dict[key]
-        if "int" in val:
-            lower = int(val.split("[")[1].split(",")[0].strip())
-            upper = int(val.split("]")[0].split(",")[-1].strip())
-            log = True if "log" in val else False
-            cs.add_hyperparameter(CSH.UniformIntegerHyperparameter(name=key, lower=lower, upper=upper, log=log))
-        elif "float" in val:
-            lower = float(val.split("[")[1].split(",")[0].strip())
-            upper = float(val.split("]")[0].split(",")[-1].strip())
-            log = True if "log" in val else False
-            cs.add_hyperparameter(CSH.UniformFloatHyperparameter(name=key, lower=lower, upper=upper, log=log))
-        elif "cat" in val:
-            choices = json.loads("[" + val.split("[")[1].split("]")[0] + "]") # Seems faster than ast.literal_eval (See https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list)
-            cs.add_hyperparameter(CSH.CategoricalHyperparameter(name=key, choices=choices))
-            # print(type(CSH.CategoricalHyperparameter(name=key, choices=choices).choices[0]))
+random_configs = OrderedDict({
+    "env": {},
+    "agent": random_agent_configs,

-    return cs
+})

-cs = create_config_space_from_config_dict(var_agent_configs)
-print("Agent variable ConfigSpace:")
-print(cs)
-random_configs = cs.sample_configuration(size=num_agent_configs)
-# print("type(random_configs):", type(random_configs))
-for i in range(len(random_configs)):
-    # if random_configs[i].get_dictionary()["train_batch_size"] == 4 \
-    # and random_configs[i].get_dictionary()["buffer_size"] < 33:
-    # print("Config:", i, "train_batch_size, buffer_size:", random_configs[i].get_dictionary()["train_batch_size"], random_configs[i].get_dictionary()["buffer_size"])
-    random_configs[i] = tuple(random_configs[i].get_dictionary().values()) #hack ####TODO Change run_experiments.py and here to directly pass whole config dict to run_experiments.py. Would need to replace in every config.py file.
-# print(random_configs)

-var_configs = OrderedDict({
-    "env": var_env_configs,
-    "agent": var_agent_configs,
+sobol_configs = OrderedDict({
+    "env": sobol_env_configs,

 })

+
 env_config = {
     "env": "RLToy-v0",
     "horizon": 100,
@@ -163,7 +59,7 @@ def create_config_space_from_config_dict(config_dict):
         'action_space_type': 'discrete',
         'generate_random_mdp': True,
         'repeats_in_sequences': False,
-        'reward_scale': 1.0,
+        # 'reward_scale': 1.0,
         'completely_connected': True,
     },
 }
@@ -226,7 +122,7 @@ def create_config_space_from_config_dict(config_dict):
         "custom_preprocessor": "ohe",
         "custom_options": {}, # extra options to pass to your preprocessor
         "fcnet_activation": "tanh",
-        "use_lstm": False,
+        # "use_lstm": False,
         "max_seq_len": 20,
         "lstm_cell_size": 256,
         "lstm_use_prev_action_reward": False,
@@ -250,13 +146,3 @@ def create_config_space_from_config_dict(config_dict):
         }
     },
 }
-
-# value_tuples = []
-# for config_type, config_dict in var_configs.items():
-#     for key in config_dict:
-#         assert type(var_configs[config_type][key]) == list, "var_config should be a dict of dicts with lists as the leaf values to allow each configuration option to take multiple possible values"
-#         value_tuples.append(var_configs[config_type][key])
-#
-# import itertools
-# cartesian_product_configs = list(itertools.product(*value_tuples))
-# print("Total number of configs. to run:", len(cartesian_product_configs))

‎experiments/rainbow_hydra_qbert.py

+11 −22

@@ -2,27 +2,18 @@
 timesteps_total = 10_000_000
 from collections import OrderedDict

-var_env_configs = OrderedDict({
+sobol_env_configs = OrderedDict({
     'delay': [0],
     'dummy_seed': [i for i in range(num_seeds)],
 })

-var_configs = OrderedDict({
-    "env": var_env_configs
-})
-
-value_tuples = []
-for config_type, config_dict in var_configs.items():
-    for key in config_dict:
-        assert type(var_configs[config_type][key]) == list, "var_config should be a dict of dicts with lists as the leaf values to allow each configuration option to take multiple possible values"
-        value_tuples.append(var_configs[config_type][key])
+sobol_configs = OrderedDict({
+    "env": sobol_env_configs

-import itertools
-cartesian_product_configs = list(itertools.product(*value_tuples))
-print("Total number of grid configs. to run:", len(cartesian_product_configs))
+})


-var_agent_configs = OrderedDict({
+random_agent_configs = OrderedDict({

     "lr": "float, log, [1e-5, 1e-3]", # 1e-4
     "learning_starts": "int, [1, 2000]", # 500
@@ -35,7 +26,7 @@

 })

-var_agent_configs = OrderedDict(sorted(var_agent_configs.items(), key=lambda t: t[0])) #hack because saved configs used below as random_configs are ordered alphabetically.
+random_agent_configs = OrderedDict(sorted(random_agent_configs.items(), key=lambda t: t[0])) #hack because ConfigSpace below orders alphabetically, the returned configs are in a jumbled order compared to the order above, which would create problems with config processing.

 random_configs = \
 [(1.86e-12, 1480, 0.0697, 311, 0.000545, 8, 1845, 64), # top 10 configs begin from here
@@ -59,14 +50,12 @@
 (0.0133, 6541, 0.218, 1393, 1.21e-05, 1, 3, 16),
 (0.0515, 507, 0.48100000000000004, 1866, 1.23e-05, 3, 136, 128)]

-for i in range(len(random_configs)):
-    random_configs[i] = tuple(random_configs[i]) ##IMP I think these are tuples because cartesian_product_configs by default has tuples.

-var_configs = OrderedDict({
-    "env": var_env_configs,
-    "agent": var_agent_configs,
-
-})
+# var_configs = OrderedDict({
+#     "env": var_env_configs,
+#     "agent": var_agent_configs,
+#
+# })

 env_config = {
     "env": "GymEnvWrapper-Atari",
+145 (new file)
@@ -0,0 +1,145 @@
1+
num_seeds = 1
2+
timesteps_total = 10_000_000
3+
num_configs = 100
4+
5+
from collections import OrderedDict
6+
7+
random_agent_configs = OrderedDict({
8+
9+
"lr": "float, log, [1e-5, 1e-3]", # 1e-4
10+
"learning_starts": "int, [10, 20000]", # 500
11+
"target_network_update_freq": "int, log, [10, 10000]", # 800,
12+
"exploration_fraction": "float, [0.01, 0.99]", # 0.1,
13+
"n_step": "int, [1, 16]", # 1
14+
"buffer_size": "int, log, [333, 500000]", # ?? 1000000, # Sizes up to 32 crashed with Ray 0.7.3 (but not always!), size 16 did not crash with Ray 0.9.0dev
15+
"adam_epsilon": "float, log, [1e-12, 1e-1]", # ?? 1e-4,
16+
"train_batch_size": "cat, [4, 8, 16, 32, 64, 128]", # 32,
17+
18+
})
19+
20+
random_agent_configs = OrderedDict(sorted(random_agent_configs.items(), key=lambda t: t[0])) #hack because ConfigSpace below orders alphabetically, the returned configs are in a jumbled order compared to the order above, which would create problems with config processing.
21+
22+
23+
random_configs = OrderedDict({
24+
"env": {},
25+
"agent": random_agent_configs,
26+
27+
})
28+
29+
30+
# These are currently needed to write dummy_seed to stats CSV. A seed column is
31+
# needed for data loading
32+
33+
sobol_env_configs = OrderedDict({
34+
'dummy_seed': (0,), # "cat, " + str([i for i in range(num_seeds)]), #seed
35+
})
36+
37+
# print(sobol_env_configs)
38+
39+
sobol_configs = OrderedDict({
40+
"env": sobol_env_configs,
41+
42+
})
43+
44+
45+
env_config = {
46+
"env": "GymEnvWrapper-Atari",
47+
"env_config": {
48+
"AtariEnv": {
49+
"game": 'qbert',
50+
'obs_type': 'image',
51+
'frameskip': 1,
52+
},
53+
# "GymEnvWrapper": {
54+
"atari_preprocessing": True,
55+
'frame_skip': 4,
56+
'grayscale_obs': False,
57+
'state_space_type': 'discrete',
58+
'action_space_type': 'discrete',
59+
'seed': 0,
60+
# },
61+
# 'seed': 0, #seed
62+
},
63+
}
64+
65+
algorithm = "DQN"
66+
agent_config = {
67+
# "adam_epsilon": 1e-4,
68+
# "buffer_size": 1000000,
69+
"double_q": True,
70+
"dueling": True,
71+
# "lr": 1e-3,
72+
"exploration_final_eps": 0.01,
73+
# "exploration_fraction": 0.1,
74+
"schedule_max_timesteps": 10_000_000,
75+
# "learning_starts": 500,
76+
# "target_network_update_freq": 800,
77+
# "n_step": 4,
78+
"noisy": False,
79+
"num_atoms": 10, # [5, 10, 20]
80+
"prioritized_replay": True,
81+
"prioritized_replay_alpha": 0.75, #
82+
"prioritized_replay_beta": 0.4,
83+
"final_prioritized_replay_beta": 1.0, #
84+
"beta_annealing_fraction": 1.0, #
85+
# "hiddens": None,
86+
'hiddens': [512],
87+
88+
"sample_batch_size": 4,
89+
"timesteps_per_iteration": 10000,
90+
# "train_batch_size": 32,
91+
"min_iter_time_s": 0,
92+
93+
'num_gpus': 0,
94+
"num_workers": 3, # extra workers I think
95+
# "num_cpus_for_driver": 2,
96+
97+
"tf_session_args": {
98+
# note: overriden by `local_tf_session_args`
99+
"intra_op_parallelism_threads": 4,
100+
"inter_op_parallelism_threads": 4,
101+
# "gpu_options": {
102+
# "allow_growth": True,
103+
# },
104+
# "log_device_placement": False,
105+
"device_count": {
106+
"CPU": 2
107+
},
108+
# "allow_soft_placement": True, # required by PPO multi-gpu
109+
},
110+
# Override the following tf session args on the local worker
111+
"local_tf_session_args": {
112+
"intra_op_parallelism_threads": 4,
113+
"inter_op_parallelism_threads": 4,
114+
},
115+
116+
}
117+
118+
model_config = {
119+
# "model": {
120+
# "fcnet_hiddens": [256, 256],
121+
# "fcnet_activation": "tanh",
122+
# "use_lstm": False,
123+
# "max_seq_len": 20,
124+
# "lstm_cell_size": 256,
125+
# "lstm_use_prev_action_reward": False,
126+
# },
127+
}
128+
129+
from ray import tune
130+
eval_config = {
131+
"evaluation_interval": None, # I think this means every x training_iterations
132+
"evaluation_config": {
133+
"explore": False,
134+
"exploration_fraction": 0,
135+
"exploration_final_eps": 0,
136+
"evaluation_num_episodes": 10,
137+
"horizon": 100,
138+
"env_config": {
139+
"dummy_eval": True, #hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
140+
'transition_noise': 0 if "state_space_type" in env_config["env_config"] and env_config["env_config"]["state_space_type"] == "discrete" else tune.function(lambda a: a.normal(0, 0)),
141+
'reward_noise': tune.function(lambda a: a.normal(0, 0)),
142+
'action_loss_weight': 0.0,
143+
}
144+
},
145+
}
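The random_agent_configs ranges above are presumably sampled with ConfigSpace, as in the helper removed from experiments/rainbow_hydra.py in this same commit. A minimal sketch of that sampling (float/log case only, for two of the ranges above):

```python
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

cs = CS.ConfigurationSpace(seed=1234)
cs.add_hyperparameter(CSH.UniformFloatHyperparameter(name="lr", lower=1e-5, upper=1e-3, log=True))
cs.add_hyperparameter(CSH.UniformFloatHyperparameter(name="adam_epsilon", lower=1e-12, upper=1e-1, log=True))

random_configs = cs.sample_configuration(size=3)
# Keys come back alphabetically ordered, which is why the dict above is sorted first.
print([tuple(c.get_dictionary().values()) for c in random_configs])
```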

‎mdp_playground/analysis/analysis.py

+24 −11

@@ -43,7 +43,9 @@ def load_data(self, experiments: dict, load_eval=True, exp_type='grid'):
             list_exp_data.append(exp_data)
         return list_exp_data

-    def get_exp_data(self, dir_name, exp_name, exp_type='grid', num_metrics=3, load_eval=True, threshold=0.05, sample_freq=1): #, max_total_configs=200):
+    def get_exp_data(self, dir_name, exp_name, exp_type='grid', num_metrics=3,
+                     load_eval=True, threshold=0.05, sample_freq=1):
+        #, max_total_configs=200):
         '''Get training and evaluation data from a single set of recorded CSV stats files.


@@ -147,29 +149,37 @@ def join_files(file_prefix, file_suffix):

         config_counts = []
         dims_values = []
-        #Keep only config_names that we wan't to measure
-        #traning iteration is always first, metrics are always last.
+        # Keep only config_names that we want to measure
+        # traning iteration is always first, metrics are always last.
         self.full_config_names = col_names.copy()
         full_config_names = self.full_config_names
        full_config_names.remove("training_iteration")

-        # mean_vals = [ np.mean(stats_pd.loc[stats_pd['target_network_update_freq'] == val]["episode_reward_mean"])
-        #               for val in stats_pd["target_network_update_freq"].unique() ]

         #config counts includes seed
-        self.seed_idx = -1
+        self.seed_idx = None # seed used to be fixed as the last, i.e.,
+        # quickest varying dimension in the <experiment config>.py file's
+        # config space because then all runs on a single env would be recorded
+        # consecutively in the stats CSV
+        self.ts_idx = None
         for i, c in enumerate(full_config_names[:-num_metrics]):
             dims_values.append(stats_pd[c].unique())
             config_counts.append(stats_pd[c].nunique())
-            if("seed" in c): ##TODO this will just set seed index to be the "last" column name with seed in it.
+            if("seed" in c): # ##TODO this will just set seed index to be
+                # the "last" column name with seed in it.
                 self.seed_idx = i
+            if c == "timesteps_total":
+                self.ts_idx = i

-        config_counts.append(num_metrics) #hardcoded number of training stats that were recorded
+
+        config_counts.append(num_metrics) # #hardcoded number of training
+        # stats that were recorded
         config_counts = tuple(config_counts)
         self.metric_names = full_config_names[-num_metrics:]
         self.config_names = full_config_names[:-num_metrics]

-        # Slice into training stats and get end of training stats for individual training runs in the experiment
+        # Slice into training stats and get end of training stats for
+        # individual training runs in the experiment
         final_rows_for_a_config = []
         previous_i = 0
         list_of_learning_curves = []
@@ -365,6 +375,7 @@ def join_files(file_prefix, file_suffix):
         exp_data['eval_aucs'] = eval_aucs

         # related to plots
+        # #TODO Remove the self from these since they are per expt. variables?
         exp_data['metric_names'] = self.metric_names
         exp_data['tick_labels'] = self.tick_labels
         exp_data['axis_labels'] = self.axis_labels
@@ -376,6 +387,7 @@ def join_files(file_prefix, file_suffix):
         exp_data['config_names'] = self.config_names
         exp_data['dims_values'] = self.dims_values
         exp_data['seed_idx'] = self.seed_idx
+        exp_data['ts_idx'] = self.ts_idx

         return exp_data

@@ -684,6 +696,7 @@ def plot_learning_curves(self, list_exp_data, save_fig=False, train=True, metric
         #HACK
         if len(list_exp_data) > 0:
             exp_data = list_exp_data[0] #TODO make changes to handle multiple experiments plot
+            warnings.warn("Using only 1st expt. for the foll. plots")
         else:
             return

@@ -722,10 +735,10 @@ def plot_learning_curves(self, list_exp_data, save_fig=False, train=True, metric
             j_index = (i//nseeds_) % ncols_ #
             if i == 0:
                 to_plot_ = stats_data[0:final_rows_for_a_config[i]+1, metric_num]
-                to_plot_x = stats_data[0:final_rows_for_a_config[i]+1,-3]
+                to_plot_x = stats_data[0:final_rows_for_a_config[i]+1, exp_data['ts_idx']]
             else:
                 to_plot_ = stats_data[final_rows_for_a_config[i-1]+1:final_rows_for_a_config[i]+1, metric_num]
-                to_plot_x = stats_data[final_rows_for_a_config[i-1]+1:final_rows_for_a_config[i]+1, -3]
+                to_plot_x = stats_data[final_rows_for_a_config[i-1]+1:final_rows_for_a_config[i]+1, exp_data['ts_idx']]
             # print(to_plot_[-1])
             # if i % 10 == 0:
             #     fig = plt.figure(figsize=(12, 7))

‎mdp_playground/config_processor/config_processor.py

+353-104
Large diffs are not rendered by default.

‎mdp_playground/envs/gym_env_wrapper.py

+7 −3

@@ -63,9 +63,13 @@ def __init__(self, env, **config):
            self.transition_noise = lambda a: 0.0

        if "reward_noise" in config:
-            self.reward_noise = config["reward_noise"]
+            if callable(config["reward_noise"]):
+                self.reward_noise = config["reward_noise"]
+            else:
+                reward_noise_std = config["reward_noise"]
+                self.reward_noise = lambda a: a.normal(0, reward_noise_std)
        else:
-            self.reward_noise = lambda a: 0.0
+            self.reward_noise = None

        if "wrap_deepmind_ray" in config and config["wrap_deepmind_ray"]: #hack ##TODO remove?
            self.env = wrap_deepmind(self.env, dim=42, framestack=True)
@@ -195,7 +199,7 @@ def step(self, action):
        # print("rewards:", self.reward_buffer, old_reward, reward)
        del self.reward_buffer[0]

-        noise_in_reward = self.reward_noise(self.np_random) #random ###TODO Would be better to parameterise this in terms of state, action and time_step as well. Would need to change implementation to have a queue for the rewards achieved and then pick the reward that was generated delay timesteps ago.
+        noise_in_reward = self.reward_noise(self.np_random) if self.reward_noise else 0 #random ###TODO Would be better to parameterise this in terms of state, action and time_step as well. Would need to change implementation to have a queue for the rewards achieved and then pick the reward that was generated delay timesteps ago.
        self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
        self.total_reward_episode += reward
        reward += noise_in_reward
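A standalone restatement of the new reward_noise handling above (mirroring the wrapper logic rather than importing it): a plain float is now interpreted as the std of Gaussian noise, a callable is used as-is, and absence of the key means no noise.

```python
import numpy as np

def make_reward_noise(config):
    if "reward_noise" not in config:
        return None
    if callable(config["reward_noise"]):
        return config["reward_noise"]
    std = config["reward_noise"]
    return lambda rng: rng.normal(0, std)

rng = np.random.RandomState(0)
for cfg in ({}, {"reward_noise": 0.5}, {"reward_noise": lambda r: r.normal(0, 2)}):
    noise_fn = make_reward_noise(cfg)
    print(noise_fn(rng) if noise_fn else 0)
```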

‎mdp_playground/envs/rl_toy_env.py

+81-105
Large diffs are not rendered by default.

‎mdp_playground/spaces/__init__.py

+3 −1

@@ -2,6 +2,8 @@
 from mdp_playground.spaces.box_extended import BoxExtended
 from mdp_playground.spaces.multi_discrete_extended import MultiDiscreteExtended
 from mdp_playground.spaces.image_multi_discrete import ImageMultiDiscrete
+from mdp_playground.spaces.image_continuous import ImageContinuous
 from mdp_playground.spaces.tuple_extended import TupleExtended

-__all__ = ["BoxExtended", "DiscreteExtended", "MultiDiscreteExtended", "ImageMultiDiscrete", "TupleExtended"]
+__all__ = ["BoxExtended", "DiscreteExtended", "MultiDiscreteExtended",\
+           "ImageMultiDiscrete", "ImageContinuous", "TupleExtended"]
mdp_playground/spaces/image_continuous.py

+194 (new file)

@@ -0,0 +1,194 @@
import warnings
import numpy as np
import gym
from gym.spaces import Box, Space
import PIL.ImageDraw as ImageDraw
import PIL.Image as Image
from PIL.Image import FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM
import os

class ImageContinuous(Box):
    '''A space that maps a continuous 1- or 2-D space 1-to-1 to images so that the
    images may be used as representations for corresponding continuous environments.

    Methods
    -------
    get_concatenated_image(continuous_obs)
        Gets an image representation for a given feature space observation
    '''

    def __init__(self, feature_space, term_spaces=None, width=100, height=100,\
                 circle_radius=5, target_point=None, relevant_indices=[0,1],\
                 seed=None, use_custom_images=None, cust_path=None, dtype=np.uint8):
        '''
        Parameters
        ----------
        feature_space : Gym.spaces.Box
            The feature space to which this class associates images as external
            observations
        term_spaces : list of Gym.spaces.Box
            Sub-spaces of the feature space which are terminal
        width : int
            The width of the image
        height : int
            The height of the image
        circle_radius : int
            The radius of the circle which represents the agent and target point
        target_point : np.array

        relevant_indices : list

        seed : int
            Seed for this space
        '''
        # ##TODO Define a common superclass for this and ImageMultiDiscrete
        self.feature_space = feature_space
        assert (self.feature_space.high != np.inf).any()
        assert (self.feature_space.low != -np.inf).any()
        self.width = width
        self.height = height
        # Warn if resolution is too low?
        self.circle_radius = circle_radius
        self.target_point = target_point
        self.term_spaces = term_spaces
        self.relevant_indices = relevant_indices
        all_indices = set(range(self.feature_space.shape[0]))
        self.irrelevant_indices = list(all_indices - set(self.relevant_indices))
        if len(self.irrelevant_indices) == 0:
            self.irrelevant_features = False
        else:
            self.irrelevant_features = True

        self.goal_colour = (0, 255, 0)
        self.agent_colour = (0, 0, 255)
        self.term_colour = (0, 0, 0)

        assert len(feature_space.shape) == 1
        relevant_dims = len(relevant_indices)
        irr_dims = len(self.irrelevant_indices)
        assert relevant_dims <= 2 and irr_dims <=2, "Image observations are "\
                                                    "supported only "\
                                                    "for 1- or 2-D feature spaces."


        # Shape has 1 appended for Ray Rllib to be compatible IIRC
        super(ImageContinuous, self).__init__(shape=(width, height, 1), \
                                              dtype=dtype, low=0, high=255)
        super(ImageContinuous, self).seed(seed=seed)

        if self.target_point is not None:
            self.target_point_pixel = self.convert_to_pixel(target_point)


    def generate_image(self, position, relevant=True):
        '''
        Parameters
        ----------
        position : np.array

        '''
        # Use RGB
        image_ = Image.new("RGB", (self.width, self.height), color=(255,255,255))
        # Use L for black and white 8-bit pixels instead of RGB in case not
        # using custom images
        # image_ = Image.new("L", (self.width, self.height))
        draw = ImageDraw.Draw(image_)

        # Draw term_spaces first, so that others are drawn over it
        if self.term_spaces is not None and relevant:
            for term_space in self.term_spaces:
                low = self.convert_to_pixel(term_space.low)
                high = self.convert_to_pixel(term_space.high)

                leftUpPoint = tuple((low))
                rightDownPoint = tuple((high))
                twoPointList = [leftUpPoint, rightDownPoint]
                draw.rectangle(twoPointList, fill=self.term_colour)

        R = self.circle_radius

        if self.target_point is not None and relevant:
            # print("draw2", self.target_point_pixel)
            leftUpPoint = tuple((self.target_point_pixel - R))
            rightDownPoint = tuple((self.target_point_pixel + R))
            twoPointList = [leftUpPoint, rightDownPoint]
            draw.ellipse(twoPointList, fill=self.goal_colour)

        pos_pixel = self.convert_to_pixel(position)
        # print("draw1", pos_pixel)
        # Draw circle https://stackoverflow.com/a/2980931/11063709
        leftUpPoint = tuple(pos_pixel - R)
        rightDownPoint = tuple(pos_pixel + R)
        twoPointList = [leftUpPoint, rightDownPoint]
        draw.ellipse(twoPointList, fill=self.agent_colour)




        # Because numpy is row-major and Image is column major, need to transpose
        # ret_arr = np.array(image_).T # For 2-D
        ret_arr = np.transpose(np.array(image_), axes=(1, 0, 2))

        return ret_arr

    def get_concatenated_image(self, obs):
        '''Gets the "stitched together" image made from images corresponding to
        each continuous sub-space within the continuous space, concatenated
        along the X-axis.
        '''
        concatenated_image = []
        # For relevant/irrelevant sub-spaces:
        concatenated_image.append(self.generate_image(obs[self.relevant_indices]))
        if self.irrelevant_features:
            irr_image = self.generate_image(obs[self.irrelevant_indices], relevant=False)
            concatenated_image.append(irr_image)

        concatenated_image = np.concatenate(tuple(concatenated_image), axis=0)

        return np.atleast_3d(concatenated_image) # because Ray expects an
        # image to have >=3 dims

    def convert_to_pixel(self, position):
        '''
        '''
        # It's implicit that both relevant and irrelevant sub-spaces have the
        # same max and min here:
        max = self.feature_space.high[self.relevant_indices]
        min = self.feature_space.low[self.relevant_indices]
        pos_pixel = ((position - min) / (max - min))
        pos_pixel = (pos_pixel * self.shape[:2]).astype(int)

        return pos_pixel


    def sample(self):

        sampled = self.feature_space.sample()
        return self.get_concatenated_image(sampled)

    def __repr__(self):
        return "{} with continuous underlying space of shape: {} and "\
               "images of resolution: {} and dtype: {}".format(self.__class__,\
               self.feature_space.shape,\
               self.shape, self.dtype)

    def contains(self, x):
        """
        Return boolean specifying if x is a valid
        member of this space
        """
        if x.shape == (self.width, self.height, 1): #TODO compare each pixel for all possible images?
            return True

    def to_jsonable(self, sample_n):
        """Convert a batch of samples from this space to a JSONable data type."""
        # By default, assume identity is JSONable
        raise NotImplementedError

    def from_jsonable(self, sample_n):
        """Convert a JSONable data type to a batch of samples from this space."""
        # By default, assume identity is JSONable
        raise NotImplementedError

    def __eq__(self, other):
        raise NotImplementedError
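A quick standalone rework of the convert_to_pixel arithmetic above; the bounds and resolution follow the values used in the test file below, and the numbers are illustrative only.

```python
import numpy as np

low, high = np.array([0.0, 0.0]), np.array([20.0, 20.0])   # feature_space bounds
width, height = 400, 400                                    # image resolution

position = np.array([5.0, 7.0])
pos_unit = (position - low) / (high - low)                  # -> [0.25, 0.35]
pos_pixel = (pos_unit * np.array([width, height])).astype(int)
print(pos_pixel)                                            # [100 140]
```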

‎mdp_playground/spaces/image_multi_discrete.py

+8 −5

@@ -16,7 +16,7 @@ class ImageMultiDiscrete(Box):
        Gets an image representation for a given multi_discrete_state
    '''

-    def __init__(self, state_space_sizes, width=100, height=100, circle_radius=20, transforms='rotate,flip,scale,shift', sh_quant=1, scale_range=(0.5,1.5), ro_quant=1, seed=None, use_custom_images=None, cust_path=None): # , polygon_sides=4
+    def __init__(self, state_space_sizes, width=100, height=100, circle_radius=20, transforms='rotate,flip,scale,shift', sh_quant=1, scale_range=(0.5,1.5), ro_quant=1, seed=None, use_custom_images=None, cust_path=None, dtype=np.uint8): # , polygon_sides=4
        '''
        Parameters
        ----------
@@ -84,7 +84,7 @@ def __init__(self, state_space_sizes, width=100, height=100, circle_radius=20, t


        # self.shape = (width, height, 1)
-        super(ImageMultiDiscrete, self).__init__(shape=(width, height, 1), dtype=np.int64, low=0, high=255) #
+        super(ImageMultiDiscrete, self).__init__(shape=(width, height, 1), dtype=dtype, low=0, high=255) #
        super(ImageMultiDiscrete, self).seed(seed=seed) #

    # def seed(self, seed=None):
@@ -214,21 +214,24 @@ def get_concatenated_image(self, multi_discrete_state,):
            # concatenated_image.append(self.disjoint_states[i][multi_discrete_state[i]])
        concatenated_image = np.concatenate(tuple(concatenated_image), axis=0)

-        return concatenated_image[..., np.newaxis] # because Ray expects an image to have >=3 dims
+        return np.atleast_3d(concatenated_image) # because Ray expects an image to have >=3 dims

    # def get_multi_discrete_state(self,

    def sample(self):
        sss = np.array(self.state_space_sizes)
-        sampled = (self.np_random.random_sample(sss.shape) * sss).astype(np.int64) # Based on Gym's MultiDiscrete sampling
+        sampled = (self.np_random.random_sample(sss.shape) * sss).astype(self.dtype) # Based on Gym's MultiDiscrete sampling
        # if type(sampled) == int:
        #     sampled = [sampled]
        sampled = list(sampled)

        return self.get_concatenated_image(sampled)

    def __repr__(self):
-        return "ImageMultiDiscrete with multi-discrete space of shape: {} and images of resolution: {}".format(self.state_space_sizes, self.shape)
+        return "{} with multi-discrete space of shape: {} and "\
+               "images of resolution: {} and dtype: {}".format(self.__class__,\
+               self.state_space_sizes,\
+               self.shape, self.dtype)

    def contains(self, x):
        """
+62 (new file)

@@ -0,0 +1,62 @@
import unittest
import numpy as np
from mdp_playground.spaces.image_continuous import ImageContinuous
from gym.spaces import Box
# import PIL.ImageDraw as ImageDraw
import PIL.Image as Image


class TestImageContinuous(unittest.TestCase):

    def test_image_continuous(self):
        lows = 0.0
        highs = 20.0
        cs2 = Box(shape=(2,), low=lows, high=highs,)
        cs4 = Box(shape=(4,), low=lows, high=highs,)

        imc = ImageContinuous(cs2, width=400, height=400,)
        pos = np.array([5.0, 7.0])
        img1 = Image.fromarray(np.squeeze(imc.generate_image(pos)), 'RGB')
        img1.show()

        target = np.array([10, 10])
        imc = ImageContinuous(cs2, target_point=target, width=400, height=400,)
        img1 = Image.fromarray(np.squeeze(imc.generate_image(pos)), 'RGB')
        img1.show()

        # Terminal sub-spaces
        lows = np.array([2., 4.])
        highs = np.array([3., 6.])
        cs2_term1 = Box(low=lows, high=highs,)
        lows = np.array([12., 3.])
        highs = np.array([13., 4.])
        cs2_term2 = Box(low=lows, high=highs,)
        term_spaces = [cs2_term1, cs2_term2]

        target = np.array([10, 10])
        imc = ImageContinuous(cs2, target_point=target, term_spaces=term_spaces,\
                              width=400, height=400,)
        pos = np.array([5.0, 7.0])
        img1 = Image.fromarray(np.squeeze(imc.get_concatenated_image(pos)), 'RGB')
        img1.show()


        # Irrelevant features
        target = np.array([10, 10])
        imc = ImageContinuous(cs4, target_point=target, width=400, height=400,)
        pos = np.array([5.0, 7.0, 10.0, 15.0])
        img1 = Image.fromarray(np.squeeze(imc.get_concatenated_image(pos)), 'RGB')
        img1.show()
        # print(imc.get_concatenated_image(pos).shape)

        # Random sample and __repr__
        imc = ImageContinuous(cs4, target_point=target, width=400, height=400,)
        print(imc)
        img1 = Image.fromarray(np.squeeze(imc.sample()), 'RGB')
        img1.show()




if __name__ == '__main__':
    unittest.main()

‎mdp_playground/spaces/test_image_multi_discrete.py

+3 −1

@@ -1,6 +1,6 @@
 import unittest
 import numpy as np
-from gym.spaces.image_multi_discrete import ImageMultiDiscrete
+from mdp_playground.spaces.image_multi_discrete import ImageMultiDiscrete
 from gym.spaces import Discrete, MultiDiscrete
 # import gym
 # from gym.spaces import MultiDiscrete
@@ -13,6 +13,8 @@ class TestImageMultiDiscrete(unittest.TestCase):

     def test_image_multi_discrete(self):
         ds4 = Discrete(4)
+        ds4 = [ds4.n]
+        print(ds4)
         imd = ImageMultiDiscrete(ds4, transforms='shift')
         from PIL import Image
         # img1 = Image.fromarray(imd.disjoint_states[0][1], 'L')

‎plot_experiments.ipynb

+93-4
@@ -448,7 +448,7 @@
448448
"source": [
449449
"# Save configs in list_exp_data_ (hacky variable name)\n",
450450
"import pickle\n",
451-
"pik = \"mdpp_hydra_reward_scales_pickle.dat\"\n",
451+
"pik = \"mdpp_hydra_configs_pickle.dat\"\n",
452452
"\n",
453453
"import os.path\n",
454454
"if not os.path.exists(pik):\n",
@@ -465,7 +465,7 @@
465465
"metadata": {},
466466
"outputs": [],
467467
"source": [
468-
"# Save configs in list_exp_data_reward_scales\n",
468+
"# Save reward_scales in list_exp_data_reward_scales\n",
469469
"import pickle\n",
470470
"pik = \"mdpp_hydra_reward_scales_pickle.dat\"\n",
471471
"\n",
@@ -529,6 +529,31 @@
529529
"del list_exp_data_reward_scales[259]"
530530
]
531531
},
532+
{
533+
"cell_type": "code",
534+
"execution_count": null,
535+
"metadata": {},
536+
"outputs": [],
537+
"source": [
538+
"contents = []\n",
539+
"for key in list_exp_data[0]:\n",
540+
" contents.append(key)\n",
541+
"print(contents)\n",
542+
"# print(list_exp_data[0]['train_stats'])\n",
543+
"print(len(list_exp_data[0]['dims_values']))\n",
544+
"\n",
545+
"print(len(list_exp_data_with_configs))\n",
546+
"print(list_exp_data_with_configs[0]['train_stats'].iloc[0,:])\n",
547+
"print(list_exp_data_with_configs[0]['train_stats'].iloc[1,:])\n",
548+
"\n",
549+
"# print(list_exp_data_with_configs[0]['train_stats']['learning_starts'])\n",
550+
"# learn_startss_mean = list_exp_data_with_configs[0]['train_stats']['learning_starts'].mean()\n",
551+
"# print(\"mean(learn_startss):\", learn_startss_mean)\n",
552+
"\n",
553+
"print(len(list_exp_data_reward_scales))\n",
554+
"# print(list_exp_data_reward_scales[0]['train_stats'])"
555+
]
556+
},
532557
{
533558
"cell_type": "code",
534559
"execution_count": null,
@@ -587,20 +612,24 @@
587612
"top_configs = {}\n",
588613
"top_configs_mins = {}\n",
589614
"perfs_all_envs = {}\n",
615+
"\n",
616+
"print(\"env x agent grid size:\", num_env_configs, num_agent_configs)\n",
590617
"for perf_set in perf_sets:\n",
591618
" top_configs[perf_set] = []\n",
592619
" top_configs_mins[perf_set] = []\n",
593620
" perfs_all_envs[perf_set] = np.zeros(shape=(num_env_configs, num_agent_configs))\n",
594621
"\n",
595622
"corrs = {}\n",
623+
"corrs_spm = {}\n",
596624
"import itertools\n",
597625
"corr_sets = ['train', 'eval', 'train_auc', 'eval_auc']\n",
598626
"corr_combos = list(itertools.combinations(corr_sets, 2))\n",
599627
"\n",
600628
"# corr_sets = ['train_eval', 'train_auc_eval_auc', 'eval_eval_auc', 'train_eval_auc', 'train_train_auc', 'eval_train_auc']\n",
601629
"for corr_combo in corr_combos:\n",
602630
" corrs[corr_combo[0] + ' and ' + corr_combo[1]] = []\n",
603-
"\n",
631+
" corrs_spm[corr_combo[0] + ' and ' + corr_combo[1]] = []\n",
632+
" \n",
604633
"for i in range(num_env_configs):\n",
605634
"# if i == 259:\n",
606635
"# continue\n",
@@ -627,13 +656,18 @@
627656
" for combo in corr_combos:\n",
628657
" corr_ = prs(perfs[combo[0]], perfs[combo[1]])[0]\n",
629658
" corrs[combo[0] + ' and ' + combo[1]].append(corr_)\n",
659+
" \n",
660+
" corr_ = spm(perfs[combo[0]], perfs[combo[1]])[0]\n",
661+
" corrs_spm[combo[0] + ' and ' + combo[1]].append(corr_)\n",
662+
" \n",
630663
"\n",
631664
"# corrs['train_eval']\n",
632665
"# corrs['train_auc_eval_auc'].append(prs(perfs['train_auc'], perfs['eval_auc']))\n",
633666
"# corrs['eval_eval_auc'].append(prs(perfs['eval'], perfs['eval_auc']))\n",
634667
"# corrs['train_eval_auc'].append(prs(perfs['train'], perfs['eval_auc']))\n",
635668
"# corrs['train_train_auc'].append(prs(perfs['train'], perfs['train_auc']))\n",
636669
"# corrs['eval_train_auc'].append(prs(perfs['eval'], perfs['train_auc']))\n",
670+
"\n",
637671
"\n"
638672
]
639673
},
@@ -734,6 +768,8 @@
734768
"\n",
735769
"\n",
736770
" print(\"Final portfolio:\", portfolio[perf_set])\n",
771+
" print(\"Final portfolio perf.:\", np.sum(hydra_perfs[perf_set]))\n",
772+
" print(\"Oracle perf.:\", sum_over_maxes[perf_set][0])\n",
737773
" print(\"Final portfolio mins:\", portfolio_mins[perf_set])\n",
738774
"\n",
739775
" import matplotlib.pyplot as plt\n",
@@ -745,6 +781,8 @@
745781
" plt.legend()\n",
746782
" plt.xlabel('Portfolio building iter.')\n",
747783
" plt.ylabel('Reward or number of configs.')\n",
784+
" plt.yscale('log')\n",
785+
" plt.grid(which='both')\n",
748786
" plt.show()\n",
749787
"\n",
750788
"# print(port_perfs_mins, sum(port_perfs_mins))\n",
@@ -760,6 +798,7 @@
760798
" plt.plot(sum_over_maxes[perf_set], label=\"Sum over maxes\")\n",
761799
" plt.plot(max_over_sums, label=\"Max over sums\")\n",
762800
" plt.legend()\n",
801+
" plt.grid(which='both')\n",
763802
" plt.show()\n",
764803
" \n",
765804
" \n",
@@ -775,14 +814,64 @@
775814
"# print(perfs_all_envs)\n",
776815
"for combo in corr_combos:\n",
777816
"# print(\"Corr. on \" + str(combo[0] + ' and ' + combo[1]), corrs[combo[0] + ' and ' + combo[1]])\n",
778-
" print(\"Max corr. on \" + str(combo[0] + ' and ' + combo[1]), max(corrs[combo[0] + ' and ' + combo[1]]))\n",
817+
" print(\"Max (across envs) corr. on \" + str(combo[0] + ' and ' + combo[1]), max(corrs[combo[0] + ' and ' + combo[1]]))\n",
779818
" print(\"Min corr. on \" + str(combo[0] + ' and ' + combo[1]), min(corrs[combo[0] + ' and ' + combo[1]]))\n",
819+
" print(\"Max spm corr. on \" + str(combo[0] + ' and ' + combo[1]), max(corrs_spm[combo[0] + ' and ' + combo[1]]))\n",
820+
" print(\"Min spm corr. on \" + str(combo[0] + ' and ' + combo[1]), min(corrs_spm[combo[0] + ' and ' + combo[1]]))\n",
821+
" \n",
780822
"# corrs[combo[0] + ' and ' + combo[1]]\n",
781823
"\n",
782824
"# for i in range(num_env_configs):\n",
783825
"# corrs[combo[0] + ' and ' + combo[1]]\n"
784826
]
785827
},
828+
{
829+
"cell_type": "code",
830+
"execution_count": null,
831+
"metadata": {},
832+
"outputs": [],
833+
"source": [
834+
"# Spearman correlation of agent configs on 100 random pairs of envs\n",
835+
"import random\n",
836+
"\n",
837+
"random.seed(0)\n",
838+
"\n",
839+
"# From https://stackoverflow.com/a/48581219/11063709\n",
840+
"n = 1000\n",
841+
"A = list(range(n))\n",
842+
"k = 2\n",
843+
"m = 100\n",
844+
"\n",
845+
"samples = set()\n",
846+
"tries = 0\n",
847+
"while len(samples) < m:\n",
848+
" samples.add(tuple(sorted(random.sample(A, k))))\n",
849+
" tries += 1\n",
850+
"\n",
851+
"samples = list(samples)\n",
852+
"# print(samples)\n",
853+
"# print(tries)\n",
854+
"\n",
855+
"corrs_spm_agents_on_envs = {}\n",
856+
"for perf_set in perf_sets:\n",
857+
" corrs_spm_agents_on_envs[perf_set] = []\n",
858+
"\n",
859+
"print(\"Spearman correlation of agent configs on 100 random pairs of envs:\")\n",
860+
"print(\"Mean, std, max, min\")\n",
861+
"for perf_set in perf_sets: \n",
862+
" for i in range(len(samples)):\n",
863+
"# print(perfs[perf_set])\n",
864+
" env_0_perfs = perfs_all_envs[perf_set][samples[i][0], :]\n",
865+
" env_1_perfs = perfs_all_envs[perf_set][samples[i][1], :]\n",
866+
" \n",
867+
" corr_spm = spm(env_0_perfs, env_1_perfs)[0]\n",
868+
" corrs_spm_agents_on_envs[perf_set].append(corr_spm)\n",
869+
" \n",
870+
"# print(corrs_spm_agents_on_envs[perf_set])\n",
871+
"\n",
872+
" print(perf_set, np.mean(corrs_spm_agents_on_envs[perf_set]), np.std(corrs_spm_agents_on_envs[perf_set]), np.max(corrs_spm_agents_on_envs[perf_set]), np.min(corrs_spm_agents_on_envs[perf_set]))"
873+
]
874+
},
786875
{
787876
"cell_type": "code",
788877
"execution_count": null,

‎run_experiments.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
'agent_config and model_config are dicts which hold the '
5252
'static configuration for the current experiment as a '
5353
'normal Python dict.')
54-
# TODO Update docs regarding how to get configs to run: i.e., Cartesian
54+
# ####TODO Update docs regarding how to get configs to run: i.e., Cartesian
5555
# product, or random, etc.
5656
parser.add_argument('-e', '--exp-name', dest='exp_name', action='store',
5757
default='mdpp_default_experiment',
@@ -73,6 +73,8 @@
7373
'for the experiment will be taken and ordered as a list '
7474
'and this number corresponds to the configuration number '
7575
'in this list. Please look in to the code for details.')
76+
# ###TODO Remove? #hack to run 1000 x 1000 env configs x agent configs.
77+
# Storing all million of them in memory may be too inefficient?
7678
parser.add_argument('-a', '--agent-config-num', dest='agent_config_num',
7779
action='store', default=None, type=int,
7880
help='Used for running the configurations of experiments '
@@ -84,7 +86,8 @@
8486
').')
8587
parser.add_argument('-m', '--save-model', dest='save_model', action='store',
8688
default=False, type=bool,
87-
help='Option to save trained NN model at the end of '
89+
help='Option to save trained NN model and framework \
90+
generated files at the end of '
8891
'training.')
8992
parser.add_argument('-t', '--framework-dir', dest='framework_dir',
9093
action='store', default='/tmp/', type=str,
@@ -122,6 +125,7 @@
122125
logging.error("Log level {} not in {}.".format(args.log_level,
123126
log_levels.keys()))
124127

128+
config_file = args.config_file
125129

126130
if args.config_file[-3:] == '.py':
127131
config_file = args.config_file[:-3]
@@ -137,7 +141,7 @@
137141

138142
print("Stats file being written to:", stats_file_name)
139143

140-
config, final_configs = config_processor.process_configs(config_file, stats_file_prefix=stats_file_name, framework=args.framework, config_num=args.config_num, log_level=log_level_)
144+
config, final_configs = config_processor.process_configs(config_file, stats_file_prefix=stats_file_name, framework=args.framework, config_num=args.config_num, log_level=log_level_, framework_dir=args.framework_dir)
141145

142146
print("Configuration number(s) that will be run:", "all" if args.config_num is None else args.config_num)
143147

@@ -154,6 +158,7 @@
154158

155159
if args.config_num is None:
156160
# final_configs = config.final_configs
161+
print("Total number of configs to run:", len(final_configs))
157162
pass
158163
else:
159164
final_configs = [final_configs[args.config_num]]
@@ -187,20 +192,23 @@
187192

188193
analysis = tune.run(
189194
algorithm,
190-
name=algorithm + str(stats_file_name.split('/')[-1]) + '_' \
191-
+ str(args.config_num), ####IMP "name" has to be specified, otherwise,
195+
name=algorithm + '_' + str(stats_file_name.split('/')[-1]) + '_' \
196+
, ####IMP "name" has to be specified, otherwise,
192197
# it may lead to clashing for temp file in ~/ray_results/... directory.
193198
stop={
194199
"timesteps_total": timesteps_total,
195200
},
196201
config=tune_config,
197202
checkpoint_at_end=args.save_model,
198-
local_dir=args.framework_dir + '/_ray_results',
203+
local_dir=args.framework_dir + '/_ray_results_' + str(args.config_num),
199204
#return_trials=True # add trials = tune.run( above
200205
)
201206

202-
pickle.dump(analysis, open("{}_analysis.pickle".format(args.exp_name),
203-
"wb"))
207+
if args.save_model:
208+
pickle.dump(analysis, open("{}_analysis.pickle".format(args.exp_name),
209+
"wb"))
210+
211+
config_processor.post_processing(framework=args.framework)
204212

205213
end = time.time()
206214
print("No. of seconds to run:", end - start)

‎run_experiments_on_cluster_nemo.sh

+8-10
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#!/bin/bash
22
#MOAB -N mdpp
3-
#MOAB -t 0-19 # specifies array job indices
3+
#MOAB -t 0-49 # specifies array job indices
44
#MOAB -l nodes=1:ppn=5
5-
#MOAB -l walltime=0:50:00:00
5+
#MOAB -l walltime=0:40:00:00
66
#MOAB -l pmem=8GB # Seems like it is memory per CPU core
77
#MOAB -d /work/ws/nemo/fr_rr1034-ws_mdpp-0 # initial working dir.
88

@@ -23,7 +23,7 @@ echo "TMPDIR: " $TMPDIR
2323

2424
printenv
2525

26-
export EXP_NAME='rainbow_hydra_qbert' # Ideally contains Area of research + algorithm + dataset # Could just pass this as job name?
26+
export EXP_NAME='dqn_space_invaders_r_noise' # Ideally contains Area of research + algorithm + dataset # Could just pass this as job name?
2727

2828
echo -e '\033[32m'
2929
# Print some information about the job to STDOUT
@@ -48,7 +48,7 @@ echo Shell used is $SHELL
4848
# source activate /home/rajanr/anaconda2/envs/py36
4949
# source activate /home/rajanr/anaconda3/envs/py36_toy_rl
5050
. /home/fr/fr_fr/fr_rr1034/anaconda3/etc/profile.d/conda.sh # for anaconda3
51-
conda activate /home/fr/fr_fr/fr_rr1034/anaconda3/envs/old_py36_toy_rl # should be conda activate and not source when using anaconda3?
51+
conda activate /home/fr/fr_fr/fr_rr1034/anaconda3/envs/py36_toy_rl # should be conda activate and not source when using anaconda3?
5252
echo $?
5353
echo Paths: $PATH
5454
#/home/rajanr/anaconda3/bin/conda activate /home/rajanr/anaconda2/envs/py36
@@ -74,13 +74,11 @@ JOB_ID=`echo ${MOAB_JOBID} | cut -d'[' -f 1`
7474
mkdir -p mdpp_${JOB_ID}
7575
cd mdpp_${JOB_ID}
7676
# cd /home/rajanr/mdpp
77-
echo ${MOAB_JOBID} ${MOAB_JOBARRAYINDEX} ${MOAB_JOBNAME}
77+
echo "MOAB_JOBID:" ${MOAB_JOBID} "MOAB_JOBARRAYINDEX:" ${MOAB_JOBARRAYINDEX} "MOAB_JOBNAME:" ${MOAB_JOBNAME}
7878

79-
# for i in {0..0}
80-
# do
81-
echo -e "Running env config $i:\n"
82-
\time -v python3 /home/fr/fr_fr/fr_rr1034/mdp-playground/run_experiments.py --exp-name ${EXP_NAME} --config-file /home/fr/fr_fr/fr_rr1034/mdp-playground/experiments/${EXP_NAME} --config-num 0 --agent-config-num ${MOAB_JOBARRAYINDEX} --framework-dir ${TMPDIR}
83-
# done
79+
80+
\time -v python3 /home/fr/fr_fr/fr_rr1034/mdp-playground/run_experiments.py --exp-name ${EXP_NAME} --config-file /home/fr/fr_fr/fr_rr1034/mdp-playground/experiments/${EXP_NAME} --config-num ${MOAB_JOBARRAYINDEX} --framework-dir ${TMPDIR}
81+
#/work/ws/nemo/fr_rr1034-ws_mdpp-0/mdpp_10405451/ray
8482

8583

8684
#python output_argv_1.py
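With this change the MOAB array index is passed directly as `--config-num`, so each array job runs exactly one configuration. A toy sketch of that selection is below; the `final_configs` list is invented and only stands in for the one returned by `config_processor.process_configs`.

```python
# Toy sketch: one MOAB array-job index picks one configuration to run.
import os

# Invented grid standing in for the list produced by config_processor.process_configs.
final_configs = [{"lr": 10 ** -exp, "transition_noise": noise}
                 for exp in range(1, 11)
                 for noise in (0.0, 0.01, 0.02, 0.10, 0.25)]   # 50 configs

# MOAB sets MOAB_JOBARRAYINDEX for each array job (0-49 in this script).
array_index = int(os.environ.get("MOAB_JOBARRAYINDEX", "0"))

selected = final_configs[array_index]
print("Array job", array_index, "runs config:", selected)
```

In the real script the number of generated configs has to match the array range declared in the `#MOAB -t` directive.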

‎setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
# package_dir={"": "src"},
6666
packages=find_packages(),
6767
python_requires=">=3.6",
68-
install_requires=['gym'],
68+
install_requires=['gym<=0.14', 'dill'],
6969
extras_require={
7070
'extras_disc': extras_require,
7171
'extras_cont': extras_require_cont,
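The pin above restricts gym to `<=0.14` and adds `dill` as a hard dependency. As a rough sketch of the surrounding `setup()` call with those requirements; the metadata and extras lists below are placeholders, not the package's actual values.

```python
# Rough sketch of a setup() call using the pinned dependencies from this diff.
# Metadata and extras below are placeholders only.
from setuptools import find_packages, setup

setup(
    name="mdp_playground",                      # placeholder metadata
    version="0.0.0",                            # placeholder
    packages=find_packages(),
    python_requires=">=3.6",
    install_requires=["gym<=0.14", "dill"],     # pin introduced in this commit
    extras_require={
        "extras_disc": ["ray[rllib]"],          # placeholder extras lists
        "extras_cont": ["mujoco-py"],
    },
)
```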

‎tests/test_mdp_playground.py

+63
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,69 @@ def test_continuous_dynamics_target_point_sparse(self):
476476
env.close()
477477

478478

479+
def test_continuous_image_representations(self):
480+
''''''
481+
print('\033[32;1;4mTEST_CONTINUOUS_IMAGE_REPRESENTATIONS\033[0m')
482+
config = {}
483+
config["log_filename"] = log_filename
484+
config["seed"] = 0
485+
486+
config["state_space_type"] = "continuous"
487+
config["action_space_type"] = "continuous"
488+
config["state_space_dim"] = 2
489+
config["action_space_dim"] = 2
490+
config["delay"] = 0
491+
config["sequence_length"] = 1 # seq_len is always going to be 1 for move_to_a_point R. assert for this?
492+
config["transition_dynamics_order"] = 1
493+
config["inertia"] = 1.0
494+
config["time_unit"] = 1
495+
496+
config["reward_function"] = "move_to_a_point"
497+
# config["make_denser"] = False
498+
config["state_space_max"] = 5 # Will be a Box in the range [-max, max]
499+
config["target_point"] = [-0.29792, 1.71012]
500+
config["target_radius"] = 0.172 # to give reward in 3rd last step. At each step, the distance reduces by ~0.035355 to the final point of this trajectory which is also the target point by design for this test.
501+
config["reward_scale"] = 2.0
502+
503+
504+
config["image_representations"] = True
505+
config["image_width"] = 100
506+
config["image_height"] = 100
507+
env = RLToyEnv(**config)
508+
state = env.get_augmented_state()['augmented_state'][-1]
509+
# init state: [ 1.9652315 -2.4397445]
510+
expected_image_sums = [7546980, 7546980, 7546980, 7547490, 7587270]
511+
512+
# obs = env.curr_obs
513+
# import PIL.Image as Image
514+
# img1 = Image.fromarray(np.squeeze(obs), 'RGB')
515+
# img1.show()
516+
517+
for i in range(5):
518+
# action = env.action_space.sample()
519+
action = np.array([-0.45, 0.8]) # just to test if acting "in a line" works
520+
next_obs, reward, done, info = env.step(action)
521+
next_state = env.get_augmented_state()['augmented_state'][-1]
522+
print("sars', done =", state, action, reward, next_state, done)
523+
state = next_state.copy()
524+
525+
# obs = env.curr_obs
526+
# import PIL.Image as Image
527+
# img1 = Image.fromarray(np.squeeze(obs), 'RGB')
528+
# img1.show()
529+
530+
if i < len(expected_image_sums):
531+
assert next_obs.sum() == expected_image_sums[i], "Expected sum over image pixels: " + str(expected_image_sums[i]) + ". Was: " + str(next_obs.sum())
532+
533+
final_dist = np.linalg.norm(state - np.array(config["target_point"]))
534+
assert final_dist < config["target_radius"]
535+
536+
# test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False)
537+
# self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error
538+
env.reset()
539+
env.close()
540+
541+
479542
def test_discrete_dynamics(self):
480543
'''Tests the P dynamics. Tests whether actions taken in terminal states lead back to the same terminal state. Tests if state in discrete environments is an int.
481544
'''
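The new test above drives a continuous environment with image observations and checks exact pixel sums. A shorter usage sketch along the same lines follows; the import path and the pre-0.26 Gym `reset()`/`step()` API are assumed from the rest of the test suite, and only the observation shape is checked rather than pixel sums.

```python
# Minimal usage sketch for a continuous RLToyEnv with image observations.
# Config keys mirror the test above; the import path and reset() return value are assumptions.
import numpy as np
from mdp_playground.envs import RLToyEnv

config = {
    "seed": 0,
    "state_space_type": "continuous",
    "action_space_type": "continuous",
    "state_space_dim": 2,
    "action_space_dim": 2,
    "transition_dynamics_order": 1,
    "inertia": 1.0,
    "time_unit": 1,
    "delay": 0,
    "sequence_length": 1,
    "reward_function": "move_to_a_point",
    "target_point": [-0.29792, 1.71012],
    "target_radius": 0.172,
    "reward_scale": 2.0,
    "state_space_max": 5,            # Box observations in [-max, max] before rendering
    "image_representations": True,
    "image_width": 100,
    "image_height": 100,
}

env = RLToyEnv(**config)
obs = env.reset()                    # pre-0.26 Gym API: reset() returns the initial observation
print("Observation shape:", np.asarray(obs).shape)   # expected e.g. (100, 100, 1)

next_obs, reward, done, info = env.step(np.array([-0.45, 0.8]))
print("Reward after one step:", reward)
env.close()
```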
