Commit 0c12ada

Added optimizers to policy
* Removed save_obs param from run_model
* Updated nsra.py
* Fixed _get_pos bug in hrl envs

1 parent 31bd5bc commit 0c12ada

9 files changed (+65 -74 lines)

configs/ns.json (+11 -4)

@@ -1,6 +1,6 @@
 {
   "env": {
-    "name": "HopperBulletEnv-v0",
+    "name": "AntMaze-v0",
     "max_steps": 2000
   },
   "noise": {
@@ -10,17 +10,24 @@
     "std_decay": 1
   },
   "policy": {
-    "ac_std": 0.01,
+    "layer_sizes": [
+      256,
+      256,
+      256
+    ],
+    "ac_std": 0.05,
+    "ac_std_decay": 0.99,
     "l2coeff": 0.005,
     "lr": 0.01,
     "lr_limit": 0.001,
     "lr_decay": 1,
-    "save_obs_chance": 0.01
+    "save_obs_chance": 0.01,
+    "ob_clip": 5
   },
   "general": {
     "name": "ns",
     "gens": 5000,
-    "policies_per_gen": 9600,
+    "policies_per_gen": 4800,
     "eps_per_policy": 1,
     "n_policies": 5,
     "batch_size": 500,

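Note: the new policy fields above (layer_sizes, ac_std, ac_std_decay, ob_clip) feed the constructors used in the updated scripts below. A minimal sketch of wiring the config up, assuming the JSON is read with munch and that AntMaze-v0 is registered with gym (the exact config-loading helper used by the scripts is not part of this diff):

import json

import gym
import torch
from munch import munchify

from src.core.policy import Policy
from src.nn.nn import FeedForward
from src.nn.optimizers import Adam

# Assumed loading path: read configs/ns.json into an attribute-accessible config.
with open('configs/ns.json') as f:
    cfg = munchify(json.load(f))

env = gym.make(cfg.env.name)  # 'AntMaze-v0'; assumes the hrl ant envs are registered

# The new fields map directly onto the constructors used in the scripts below:
nn = FeedForward(cfg.policy.layer_sizes,  # [256, 256, 256]
                 torch.nn.Tanh(),
                 env,
                 cfg.policy.ac_std,       # 0.05, decayed by ac_std_decay each generation
                 cfg.policy.ob_clip)      # 5
policy = Policy(nn, cfg, Adam)            # the optimizer is now built inside Policy (see src/core/policy.py below)
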
multi_agent.py (+10 -11)

@@ -14,7 +14,7 @@
 from src.gym.unity import UnityGymWrapper
 from src.nn.nn import FeedForward
 from src.nn.obstat import ObStat
-from src.nn.optimizers import Adam, Optimizer
+from src.nn.optimizers import Adam
 from src.utils import utils
 from src.utils.rankers import CenteredRanker
 from src.utils.reporters import LoggerReporter, ReporterSet, StdoutReporter, MLFlowReporter
@@ -92,29 +92,28 @@ def custom_test_params(n: int, policies: List[Policy], fit_fn, obstats: List[ObS
 # initializing obstat, policy, optimizer, noise and ranker
 obstats: List[ObStat] = [ObStat(env.observation_space[i].shape, 1e-2) for i in range(2)]
 neuralnets = [FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)]
-policies: List[Policy] = [Policy(nn, cfg.noise.std) for nn in neuralnets]
-optims: List[Optimizer] = [Adam(policy, cfg.policy.lr) for policy in policies]
+policies: List[Policy] = [Policy(nn, cfg, Adam) for nn in neuralnets]
 nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policies[0]), None, cfg.general.seed)
 ranker = CenteredRanker()


 def r_fn(models: List[torch.nn.Module], use_ac_noise=True) -> TrainingResult:
     save_obs = rs.random() < cfg.policy.save_obs_chance
-    rews, behv, obs, stps = gym_runner.multi_agent_gym_runner(models,
-                                                              env,
-                                                              cfg.env.max_steps,
-                                                              rs if use_ac_noise else None,
-                                                              save_obs)
-    return MultiAgentTrainingResult(rews, behv, obs, stps)
+    rews, behv, obs, steps = gym_runner.multi_agent_gym_runner(models,
+                                                               env,
+                                                               cfg.env.max_steps,
+                                                               rs if use_ac_noise else None)
+    return MultiAgentTrainingResult(rews, behv,
+                                    obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)


 for gen in range(cfg.general.gens):
     reporter.start_gen()
     gen_obstats = [ObStat(env.observation_space[i].shape, 0) for i in range(2)]
     results = custom_test_params(eps_per_proc, policies, r_fn, gen_obstats)
-    for (pos_res, neg_res, inds, steps), policy, optim in zip(results, policies, optims):
+    for (pos_res, neg_res, inds, steps), policy in zip(results, policies):
         ranker.rank(pos_res, neg_res, inds)
-        es.approx_grad(ranker, nt, policy.flat_params, optim, cfg.general.batch_size, cfg.policy.l2coeff)
+        es.approx_grad(policy, ranker, nt, policy.flat_params, cfg.general.batch_size, cfg.policy.l2coeff)
         noiseless_result = RewardResult([0], [0], np.empty(1), 0)
         reporter.log_gen(ranker.fits, noiseless_result, policy, steps)

nsra.py (+11 -14)

@@ -15,8 +15,7 @@
 from src.gym import gym_runner
 from src.gym.training_result import NSRResult, NSResult
 from src.nn.nn import FeedForward
-from src.nn.obstat import ObStat
-from src.nn.optimizers import Adam, Optimizer
+from src.nn.optimizers import Adam
 from src.utils import utils
 from src.utils.novelty import update_archive, novelty
 from src.utils.rankers import CenteredRanker, MultiObjectiveRanker
@@ -89,23 +88,21 @@ def main(cfg: Munch):

     archive: Optional[np.ndarray] = None

-    def ns_fn(model: torch.nn.Module) -> NSRResult:
+    def ns_fn(model: torch.nn.Module, use_ac_noise=True) -> NSRResult:
         """Reward function"""
-        rews, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs)
-        return NSRResult(rews, behv, obs, steps, archive, cfg.novelty.k)
+        save_obs = rs.random() < cfg.policy.save_obs_chance
+        rews, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
+        return NSRResult(rews, behv, obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps,
+                         archive, cfg.novelty.k)

     # init population
     population = []
     nns = []
     for _ in range(cfg.general.n_policies):
         nns.append(FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip))
-        population.append(Policy(nns[-1], cfg.noise.std))
+        population.append(Policy(nns[-1], cfg, Adam))
     # init optimizer and noise table
-    optims: List[Optimizer] = [Adam(policy, cfg.policy.lr) for policy in population]
     nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(population[0]), reporter, cfg.general.seed)
-
-    obstat: ObStat = ObStat(env.observation_space.shape, 1e-2)  # eps to prevent dividing by zero at the beginning
-
     policies_best_rewards = [-np.inf] * cfg.general.n_policies
     time_since_best = [0 for _ in range(cfg.general.n_policies)]  # TODO should this be per individual?
     obj_weight = [cfg.nsr.initial_w for _ in range(cfg.general.n_policies)]
@@ -120,7 +117,6 @@ def ns_fn(model: torch.nn.Module) -> NSRResult:
         idx = random.choices(list(range(len(policies_novelties))), weights=policies_novelties, k=1)[0]
         if cfg.nsr.progressive: idx = gen % cfg.general.n_policies
         idx = comm.scatter([idx] * comm.size)
-        nns[idx].set_ob_mean_std(obstat.mean, obstat.std)
         ranker = MultiObjectiveRanker(CenteredRanker(), obj_weight[idx])
         # reporting
         if cfg.general.mlflow: mlflow_reporter.set_active_run(idx)
@@ -129,11 +125,12 @@ def ns_fn(model: torch.nn.Module) -> NSRResult:
         reporter.log({'w': obj_weight[idx]})
         reporter.log({'time since best': time_since_best[idx]})
         # running es
-        tr, gen_obstat = es.step(cfg, comm, population[idx], optims[idx], nt, env, ns_fn, rs, ranker, reporter)
+        tr, gen_obstat = es.step(cfg, comm, population[idx], nt, env, ns_fn, rs, ranker, reporter)
+        for policy in population:
+            policy.update_obstat(gen_obstat)  # shared obstat
+
         # sharing result and obstat
         tr = comm.scatter([tr] * comm.size)
-        gen_obstat.mpi_inc(comm)
-        obstat += gen_obstat
         # updating the weighting for choosing the next policy to be evaluated
         behv = comm.scatter([mean_behv(population[idx], ns_fn, cfg.novelty.rollouts)] * comm.size)
         nov = comm.scatter([novelty(behv, archive, cfg.novelty.k)] * comm.size)
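
Note: the es.step block above replaces the single global ObStat kept in the training loop with a shared update, folding each generation's statistics into every policy in the population. A toy sketch of that accumulation, using a stand-in class rather than the repo's ObStat (whose internals may differ):

import numpy as np

class RunningStat:
    """Toy stand-in for ObStat: running count/mean/std of observations."""
    def __init__(self, shape, eps=1e-2):
        self.count = eps                  # eps prevents dividing by zero at the beginning
        self.sum = np.zeros(shape)
        self.sumsq = np.full(shape, eps)

    def include(self, obs: np.ndarray):
        self.count += len(obs)
        self.sum += obs.sum(axis=0)
        self.sumsq += np.square(obs).sum(axis=0)

    def __iadd__(self, other):            # mirrors obstat += gen_obstat
        self.count += other.count
        self.sum += other.sum
        self.sumsq += other.sumsq
        return self

    @property
    def mean(self):
        return self.sum / self.count

    @property
    def std(self):
        return np.sqrt(np.maximum(self.sumsq / self.count - np.square(self.mean), 1e-8))

# Every member of the population now receives the same generation statistics,
# mirroring `for policy in population: policy.update_obstat(gen_obstat)` above.
population_stats = [RunningStat((3,)) for _ in range(5)]
gen_stat = RunningStat((3,), eps=0)
gen_stat.include(np.random.randn(100, 3))
for stat in population_stats:
    stat += gen_stat                      # all policies normalize with identical mean/std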

obj.py (+9 -9)

@@ -12,7 +12,7 @@
 from src.gym import gym_runner
 from src.gym.training_result import TrainingResult, RewardResult
 from src.nn.nn import FeedForward, BaseNet
-from src.nn.optimizers import Adam, Optimizer
+from src.nn.optimizers import Adam
 from src.utils import utils
 from src.utils.rankers import CenteredRanker, EliteRanker
 from src.utils.reporters import LoggerReporter, ReporterSet, StdoutReporter, MLFlowReporter
@@ -42,9 +42,9 @@ def main(cfg):
         nn: BaseNet = policy._module
     else:
         nn: BaseNet = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
-    policy: Policy = Policy(nn, cfg.noise.std)
+    policy: Policy = Policy(nn, cfg, Adam)
+    # optim: Optimizer = Adam(policy, cfg.policy.lr)

-    optim: Optimizer = Adam(policy, cfg.policy.lr)
     nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), reporter, cfg.general.seed)

     ranker = CenteredRanker()
@@ -59,12 +59,12 @@ def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
         save_obs = rs.random() < cfg.policy.save_obs_chance
         rews = np.zeros(cfg.env.max_steps)
         for _ in range(max(1, cfg.general.eps_per_policy)):
-            rew, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps,
-                                                         rs if use_ac_noise else None, save_obs)
+            rew, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
             rews[:len(rew)] += np.array(rew)

         rews /= max(1, cfg.general.eps_per_policy)
-        return RewardResult(rews.tolist(), behv, obs, steps)
+        return RewardResult(rews.tolist(), behv, obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
+                            steps)

     time_since_best = 0
     noise_std_inc = 0.08
@@ -75,16 +75,16 @@ def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
         if cfg.noise.std_decay != 1:
             reporter.log({'noise std': policy.std})
         if cfg.policy.lr_decay != 1:
-            reporter.log({'lr': optim.lr})
+            reporter.log({'lr': policy.optim.lr})
         if cfg.policy.ac_std_decay != 1:
             reporter.log({'ac std': nn._action_std})

-        tr, gen_obstat = es.step(cfg, comm, policy, optim, nt, env, r_fn, rs, ranker, reporter)
+        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
         policy.update_obstat(gen_obstat)

         cfg.policy.ac_std = nn._action_std = nn._action_std * cfg.policy.ac_std_decay
         cfg.noise.std = policy.std = max(cfg.noise.std * cfg.noise.std_decay, cfg.noise.std_limit)
-        cfg.policy.lr = optim.lr = max(cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)
+        cfg.policy.lr = policy.optim.lr = max(cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)

         reporter.log({'obs recorded': policy.obstat.count})

simple_example.py (+6 -9)

@@ -10,7 +10,7 @@
 from src.gym.training_result import TrainingResult, RewardResult
 from src.nn.nn import FeedForward
 from src.nn.obstat import ObStat
-from src.nn.optimizers import Adam, Optimizer
+from src.nn.optimizers import Adam
 from src.utils import utils
 from src.utils.rankers import CenteredRanker
 from src.utils.utils import generate_seed
@@ -28,34 +28,31 @@
 rs = utils.seed(comm, cfg.general.seed, env)

 # initializing obstat, policy, optimizer, noise and ranker
-obstat: ObStat = ObStat(env.observation_space.shape, 1e-2)  # eps to prevent dividing by zero at the beginning
 nn = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
-policy: Policy = Policy(nn, cfg.noise.std)
-optim: Optimizer = Adam(policy, cfg.policy.lr)
+policy: Policy = Policy(nn, cfg, Adam)
 nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), None, cfg.general.seed)
 ranker = CenteredRanker()


 def r_fn(model: torch.nn.Module) -> TrainingResult:
     save_obs = (rs.random() if rs is not None else np.random.random()) < cfg.policy.save_obs_chance
-    rews, behv, obs, steps = gym_runner.run_model(model, env, 10000, rs, save_obs)
-    return RewardResult(rews, behv, obs, steps)
+    rews, behv, obs, steps = gym_runner.run_model(model, env, 10000, rs)
+    return RewardResult(rews, behv, obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)


 assert cfg.general.policies_per_gen % comm.size == 0 and (cfg.general.policies_per_gen / comm.size) % 2 == 0
 eps_per_proc = int((cfg.general.policies_per_gen / comm.size) / 2)
 for gen in range(cfg.general.gens):  # main loop
     if comm.rank == 0: print(f'Generation:{gen}')  # only print on one process
-    nn.set_ob_mean_std(obstat.mean, obstat.std)  # for normalizing the observation space

     # the block below is encapsulated in es.step(...), but this is more flexible. Example use can be seen in obj.py
     gen_obstat = ObStat(env.observation_space.shape, 0)  # for normalizing the observation space
     # obtaining the fitnesses from many perturbed policies
     pos_fits, neg_fits, inds, steps = es.test_params(comm, eps_per_proc, policy, nt, gen_obstat, r_fn, rs)
-    obstat += gen_obstat  # adding the new observations to the global obstat
+    policy.update_obstat(gen_obstat)
     ranker.rank(pos_fits, neg_fits, inds)  # ranking the fitnesses between -1 and 1
     # approximating the gradient and updating policy.flat_params (pseudo backprop)
-    es.approx_grad(ranker, nt, policy.flat_params, optim, cfg.general.batch_size, cfg.policy.l2coeff)
+    es.approx_grad(policy, ranker, nt, policy.flat_params, cfg.general.batch_size, cfg.policy.l2coeff)

     if comm.rank == 0: print(f'avg fitness:{np.mean(np.concatenate((pos_fits, neg_fits)))}\n\n')
     if gen % 10 and comm.rank == 0:  # save policy every 10 generations

src/core/es.py (+3 -5)

@@ -16,7 +16,6 @@
 from src.core.policy import Policy
 from src.gym.training_result import TrainingResult
 from src.nn.obstat import ObStat
-from src.nn.optimizers import Optimizer
 from src.utils.rankers import Ranker, CenteredRanker
 from src.utils.reporters import StdoutReporter, Reporter
 from src.utils.utils import scale_noise
@@ -26,7 +25,6 @@
 def step(cfg,
          comm: MPI.Comm,
          policy: Policy,
-         optim: Optimizer,
          nt: NoiseTable,
          env: gym.Env,
          fit_fn: Callable[[Module], TrainingResult],
@@ -48,7 +46,7 @@ def step(cfg,
         reporter.print(f'n dupes: {len(inds) - len(set(inds))}')

     ranker.rank(pos_res, neg_res, inds)
-    approx_grad(ranker, nt, policy.flat_params, optim, cfg.general.batch_size, cfg.policy.l2coeff)
+    approx_grad(policy, ranker, nt, policy.flat_params, cfg.general.batch_size, cfg.policy.l2coeff)
     noiseless_result = fit_fn(policy.pheno(np.zeros(len(policy))), False)
     reporter.log_gen(ranker.fits, noiseless_result, policy, steps)

@@ -99,7 +97,7 @@ def _share_results(comm: MPI.Comm,
     return results.reshape((-1, 1 + 2 * objectives))  # flattening the process dim


-def approx_grad(ranker: Ranker, nt: NoiseTable, params: ndarray, optim: Optimizer, batch_size: int, l2coeff: float):
+def approx_grad(policy: Policy, ranker: Ranker, nt: NoiseTable, params: ndarray, batch_size: int, l2coeff: float):
     """Approximating gradient and update policy params"""
     grad = scale_noise(ranker.ranked_fits, ranker.noise_inds, nt, batch_size) / ranker.n_fits_ranked
-    optim.step(l2coeff * params - grad)
+    policy.optim.step(l2coeff * params - grad)
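
Note: the approx_grad change only moves the optimizer lookup onto the policy; the update rule is unchanged. A small self-contained sketch (toy shapes, with the rank-weighted noise sum written out in place of scale_noise) of the quantity handed to policy.optim.step:

import numpy as np

rng = np.random.default_rng(0)

ranked_fits = np.array([0.9, -0.4, 0.2, -0.7])  # centered ranks of the sampled perturbations
noise = rng.standard_normal((4, 6))             # noise slices those perturbations were built from
params = rng.standard_normal(6)                 # stands in for policy.flat_params
l2coeff = 0.005

# Rank-weighted average of the noise: conceptually scale_noise(...) / ranker.n_fits_ranked.
grad = ranked_fits @ noise / len(ranked_fits)

# approx_grad hands this to policy.optim.step(...): an L2 pull on the parameters
# minus the estimated ascent direction.
update = l2coeff * params - grad
print(update.shape)  # (6,)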

src/core/policy.py (+7 -7)

@@ -2,29 +2,32 @@

 import os
 import pickle
+from typing import Type

 import numpy as np
 import torch
+from munch import Munch

 from src.nn.nn import BaseNet
 from src.nn.obstat import ObStat
+from src.nn.optimizers import Optimizer


 def init_normal(m):
     if type(m) == torch.nn.Linear:
         torch.nn.init.kaiming_normal_(m.weight)


-class Policy(torch.nn.Module):
-    def __init__(self, module: BaseNet, std: float):
-        super().__init__()
+class Policy:
+    def __init__(self, module: BaseNet, cfg: Munch, OptimType: Type[Optimizer]):
         module.apply(init_normal)

         self._module: BaseNet = module
-        self.std = std
+        self.std = cfg.noise.std

         self.flat_params: np.ndarray = Policy.get_flat(module)
         self.obstat: ObStat = ObStat(module._obmean.shape, 1e-2)
+        self.optim = OptimType(self, cfg.policy.lr)

     def __len__(self):
         return len(self.flat_params)
@@ -68,6 +71,3 @@ def pheno(self, noise: np.ndarray = None) -> torch.nn.Module:
     def update_obstat(self, obstat: ObStat):
         self.obstat += obstat  # adding the new observations to the global obstat
         self._module.set_ob_mean_std(self.obstat.mean, self.obstat.std)
-
-    def forward(self, inp):
-        self._module.forward(inp)
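
Note: Policy is no longer a torch.nn.Module and now builds its own optimizer from a passed-in type. A generic toy of that injection pattern (not the repo's classes), showing why call sites can drop their separate Optimizer objects:

from typing import Type

import numpy as np

class ToyOptimizer:
    """Stand-in for src.nn.optimizers.Optimizer: applies an update to flat params."""
    def __init__(self, policy: 'ToyPolicy', lr: float):
        self.policy, self.lr = policy, lr

    def step(self, update: np.ndarray):
        self.policy.flat_params -= self.lr * update

class ToyPolicy:
    """Mirrors the new Policy signature: the optimizer type is injected and instantiated inside."""
    def __init__(self, flat_params: np.ndarray, lr: float, OptimType: Type[ToyOptimizer]):
        self.flat_params = flat_params
        self.optim = OptimType(self, lr)  # analogous to self.optim = OptimType(self, cfg.policy.lr)

policy = ToyPolicy(np.zeros(6), lr=0.01, OptimType=ToyOptimizer)
policy.optim.step(np.ones(6))             # callers now reach the optimizer through policy.optim
policy.optim.lr *= 0.99                   # lr decay can also be applied in place, as obj.py does above

Keeping the optimizer on the policy is what lets es.step and approx_grad drop their Optimizer parameters in src/core/es.py above.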

src/gym/gym_runner.py (+3 -8)

@@ -15,7 +15,6 @@ def run_model(model: torch.nn.Module,
               env: gym.Env,
               max_steps: int,
               rs: np.random.RandomState = None,
-              save_obs: bool = False,
               render: bool = False) -> Tuple[List[float], List[float], np.ndarray, int]:
     """
     Evaluates model on the provided env
@@ -32,10 +31,9 @@ def run_model(model: torch.nn.Module,

         action = model(ob, rs=rs)
         ob, rew, done, _ = env.step(action.numpy())
-        if save_obs:
-            obs.append(ob)

         rews += [rew]
+        obs.append(ob)
         behv.extend(_get_pos(env.unwrapped))

         if render:
@@ -44,9 +42,6 @@ def run_model(model: torch.nn.Module,
         if done:
             break

-    if not save_obs:
-        obs.append(np.zeros(ob.shape))
-
     behv += behv[-3:] * (max_steps - int(len(behv) / 3))  # extending the behaviour vector to have `max_steps` elements
     return rews, behv, np.array(obs), step

@@ -96,8 +91,8 @@ def multi_agent_gym_runner(policies: List[torch.nn.Module],


 def _get_pos(env):
-    if env.spec.id[:-3] in ["AntMaze", "AntPush", "AntFall"]:
-        return env._robot_x, env._robot_y, 0
+    if env.spec.id[:-3] in ["AntMaze", "AntPush", "AntFall"]:  # hrl ant env
+        return env.wrapped_env.get_body_com("torso")[:3]

     if BULLET_ENV_SUFFIX in env.spec.id:  # bullet env
         return env.robot_body.get_pose()[:3]
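
Note: since run_model no longer takes save_obs, it always returns the collected observations and the reward functions above decide whether to keep them. A minimal caller-side sketch with assumed names:

import numpy as np

def keep_or_discard(obs: np.ndarray, ob_shape, save_obs: bool) -> np.ndarray:
    """Mirrors the pattern used in obj.py / simple_example.py: when this episode's
    observations are not being saved, a single zero observation is returned instead."""
    return obs if save_obs else np.array([np.zeros(ob_shape)])

obs = np.random.randn(7, 3)                               # observations returned by run_model
print(keep_or_discard(obs, (3,), save_obs=False).shape)   # (1, 3)
print(keep_or_discard(obs, (3,), save_obs=True).shape)    # (7, 3)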
