# double_q_learn_tabular_seq.py
num_seeds = 10

from collections import OrderedDict

# Environment settings that are varied across runs; each key maps to a list of
# candidate values, and the Cartesian product of all lists is taken below.
var_env_configs = OrderedDict({
    'state_space_size': [8],  # or [8, 10, 12, 14], or [2**i for i in range(1, 6)]
    'action_space_size': [8],  # or [2, 4, 8, 16], or [2**i for i in range(1, 6)]
    'delay': [0],  # + [2**i for i in range(4)]
    'sequence_length': [i for i in range(1, 5)],
    'reward_density': [0.25],  # np.linspace(0.0, 1.0, num=5)
    'make_denser': [False],
    'terminal_state_density': [0.25],  # np.linspace(0.1, 1.0, num=5)
    'transition_noise': [0],  # or [0, 0.01, 0.02, 0.10, 0.25]
    'reward_noise': [0],  # or [0, 1, 5, 10, 25]; std. dev. of normal dist.
    'dummy_seed': [i for i in range(num_seeds)],
})
var_configs = OrderedDict({
    "env": var_env_configs,
})
env_config = {
    "env": "RLToy-v0",
    "horizon": 100,
    "env_config": {
        'seed': 0,  # seed
        'state_space_type': 'discrete',
        'action_space_type': 'discrete',
        'generate_random_mdp': True,
        'repeats_in_sequences': False,
        'reward_scale': 1.0,
        'completely_connected': True,
    },
}
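# Agent hyperparameters are read from tabular_rl/config.yaml. That file is not
# shown here; the layout below is only an illustrative sketch inferred from the
# lookups that follow (keys match the code, values are placeholder assumptions):
#
#   env_name: RLToy-v0
#   agent_name: double_q_learning
#   eval_eps: 10
#   seed: 0
#   no_render: true
#   discount_factor: 0.99
#   agents:
#     double_q_learning:
#       alpha: 0.1
#       episodes: 1000
#       env_max_steps: 100
#       agent_eps_decay: 0.999
#       agent_eps: 1.0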
import yaml

with open("tabular_rl/config.yaml", "r") as stream:
    config = yaml.safe_load(stream)

env_name = config["env_name"]
agent_name = config["agent_name"]
agent_config = config["agents"][agent_name]
eval_eps = config["eval_eps"]
seed = config["seed"]
no_render = config["no_render"]
discount_factor = config["discount_factor"]

alpha = agent_config["alpha"]
episodes = agent_config["episodes"]
env_max_steps = agent_config["env_max_steps"]
agent_eps_decay = agent_config["agent_eps_decay"]
agent_eps = agent_config["agent_eps"]
# timesteps_per_iteration = agent_config["timesteps_per_iteration"]

# Rebuild agent_config as the dict of hyperparameters actually passed to the agent.
agent_config = {
    # "env_max_steps": env_max_steps,
    "num_episodes": episodes,
    "epsilon_decay": agent_eps_decay,
    "epsilon": agent_eps,
    "render_eval": no_render,
    "discount_factor": discount_factor,
    "alpha": alpha,
    "eval_every": eval_eps,
    # "timesteps_per_iteration": timesteps_per_iteration,  # TODO: perhaps pass this later as an argument to the agent
}
algorithm = "double_q_learn_tabular_seq"
# agent_config = {
#     "adam_epsilon": 1e-4,
#     "beta_annealing_fraction": 1.0,
#     "buffer_size": 1000000,
#     "double_q": False,
#     "dueling": False,
#     "exploration_final_eps": 0.01,
#     "exploration_fraction": 0.1,
#     "final_prioritized_replay_beta": 1.0,
#     "hiddens": None,
#     "learning_starts": 1000,
#     "lr": 1e-4,  # "lr": grid_search([1e-2, 1e-4, 1e-6]),
#     "n_step": 1,
#     "noisy": False,
#     "num_atoms": 1,
#     "prioritized_replay": False,
#     "prioritized_replay_alpha": 0.5,
#     "sample_batch_size": 4,
#     "schedule_max_timesteps": 20000,
#     "target_network_update_freq": 800,
#     "timesteps_per_iteration": 1000,
#     "min_iter_time_s": 0,
#     "train_batch_size": 32,
# }
model_config = {
    "model": {
        "fcnet_hiddens": [256, 256],
        "custom_preprocessor": "ohe",
        "custom_options": {},  # extra options to pass to your preprocessor
        "fcnet_activation": "tanh",
        "use_lstm": False,
        "max_seq_len": 20,
        "lstm_cell_size": 256,
        "lstm_use_prev_action_reward": False,
    },
}
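# Expand the variable-config grid: collect the list of candidate values for
# every key in var_configs, then take the Cartesian product of those lists so
# that each resulting tuple is one complete experiment configuration.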
value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        assert type(var_configs[config_type][key]) == list, (
            "var_configs should be a dict of dicts with lists as the leaf"
            " values, to allow each configuration option to take multiple"
            " possible values"
        )
        value_tuples.append(var_configs[config_type][key])
import itertools
cartesian_product_configs = list(itertools.product(*value_tuples))
print("Total number of configs. to run:", len(cartesian_product_configs))
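# A small sketch (not part of the original script) showing how each tuple in
# cartesian_product_configs can be mapped back to named settings by zipping it
# with the keys collected above; `config_keys` and `current_config` are
# hypothetical names introduced only for this illustration.
config_keys = [
    (config_type, key)
    for config_type, config_dict in var_configs.items()
    for key in config_dict
]
for values in cartesian_product_configs[:1]:  # e.g. inspect the first config
    current_config = {key: value for key, value in zip(config_keys, values)}
    print(current_config)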