
Commit 8a90502

Merge pull request #24 from HumanCompatibleAI/extend-interpret
Fix objective for feature visualization
2 parents 283e84b + a2bc763 · commit 8a90502

File tree: 3 files changed, +67 −11 lines
src/reward_preprocessing/interpret.py

Lines changed: 36 additions & 8 deletions
@@ -174,16 +174,18 @@ def interpret(
         # Imitation reward nets have 4 input args, lucent expects models to only have 1.
         # This wrapper makes it so rew_net accepts a single input which is a
         # transition tensor.
-        rew_net = TensorTransitionWrapper(rew_net)
+        model_to_analyse = TensorTransitionWrapper(rew_net)
     else:  # Use GAN
         # Combine rew net with GAN.
         gan = th.load(gan_path, map_location=th.device(device))
-        rew_net = RewardGeneratorCombo(reward_net=rew_net, generator=gan.generator)
+        model_to_analyse = RewardGeneratorCombo(
+            reward_net=rew_net, generator=gan.generator
+        )

-    rew_net.eval()  # Eval for visualization.
+    model_to_analyse.eval()  # Eval for visualization.

     custom_logger.log("Available layers:")
-    custom_logger.log(get_model_layers(rew_net))
+    custom_logger.log(get_model_layers(model_to_analyse))

     # Load the inputs into the model that are used to do dimensionality reduction and
     # getting the shape of activations.
@@ -217,7 +219,6 @@ def interpret(
     # In our case this is one of the following:
     # - A reward net that has been wrapped, so it accepts transition tensors.
     # - A combo of GAN and reward net that accepts latent input vectors.
-    model_to_analyse = rew_net
     nmf = LayerNMF(
         model=model_to_analyse,
         features=num_features,
@@ -228,8 +229,8 @@ def interpret(
         activation_fn="sigmoid",
     )

-    custom_logger.log(f"Dimensionality reduction (to, from): {nmf.channel_dirs.shape}")
     # If these are equal, then of course there is no actual reduction.
+    custom_logger.log(f"Dimensionality reduction (to, from): {nmf.channel_dirs.shape}")

     num_features = nmf.channel_dirs.shape[0]
     rows, columns = 2, num_features
@@ -246,14 +247,16 @@ def interpret(
         # This does the actual interpretability, i.e. it calculates the
         # visualizations.
         opt_transitions = nmf.vis_traditional(transforms=transforms)
-        # This gives as an array that optimizes the objectives, in the shape of the
+        # This gives us an array that optimizes the objectives, in the shape of the
         # input which is a transition tensor. However, lucent helpfully transposes
         # the output such that the channel dimension is last. Our functions expect
         # channel dim before spatial dims, so we need to transpose it back.
         opt_transitions = opt_transitions.transpose(0, 3, 1, 2)
+        # In the following we need opt_transitions to be a pytorch tensor.
+        opt_transitions = th.tensor(opt_transitions)
         # Split the optimized transitions, one for each feature, into separate
         # observations and actions. This function only works with torch tensors.
-        obs, acts, next_obs = tensor_to_transition(th.tensor(opt_transitions))
+        obs, acts, next_obs = tensor_to_transition(opt_transitions)
         # obs and next_obs output have channel dim last.
         # acts is output as one-hot vector.
     else:
@@ -272,15 +275,40 @@ def interpret(
         opt_transitions = gan.generator(opt_latent_th)
         obs, acts, next_obs = tensor_to_transition(opt_transitions)

+    # What reward does the model output for these generated transitions?
+    # (done isn't used in the reward function)
+    # There are three possible options here:
+    # - The reward net does not use action -> it does not matter what we pass as
+    #   action.
+    # - The reward net does use action, and we are optimizing an intermediate layer
+    #   -> since action is only used on the final layer (to choose which of the 15
+    #   heads has the correct reward), it does not matter what we pass as action.
+    # - The reward net does use action, and we are optimizing the final layer
+    #   -> the action index of the action corresponds to the index of the feature.
+    # Note that since actions is only used to choose which head to use, there are no
+    # gradients from the reward to the action. Consequently, acts in opt_latent is
+    # meaningless.
+    actions = th.tensor(list(range(num_features))).to(device)
+    assert len(actions) == len(obs)
+    rews = rew_net(obs.to(device), actions, next_obs.to(device), done=None)
+    custom_logger.log(f"Rewards: {rews}")
+
     # Use numpy from here.
     obs = obs.detach().cpu().numpy()
     next_obs = next_obs.detach().cpu().numpy()
+    rews = rews.detach().cpu().numpy()

     # We want to plot the name of the action, if applicable.
     features_are_actions = _determine_features_are_actions(nmf, layer_name)

     # Set of images, one for each feature, add each to plot
     for feature_i in range(next_obs.shape[0]):
+        # Log the rewards
+        rew_key = f"rew_feat_{feature_i:02}"
+        if features_are_actions:
+            rew_key += f"_{_get_action_meaning(action_id=feature_i)}"
+        custom_logger.record(rew_key, rews[feature_i])
+        # Log the images
         sub_img_obs = obs[feature_i]
         sub_img_next_obs = next_obs[feature_i]
         _log_single_transition_wandb(
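
The three cases in the comment block above all rest on the same fact: the action only indexes which reward head is read out, so no gradient flows from the reward back to the action. A minimal sketch of that behaviour, using a hypothetical multi-head reward net (not this repo's actual architecture):

import torch as th

# Hypothetical stand-in for a discrete-action reward net: one reward head per
# action, where the integer action only selects which head is read out.
class ToyMultiHeadRewardNet(th.nn.Module):
    def __init__(self, obs_dim: int = 8, n_actions: int = 15):
        super().__init__()
        self.trunk = th.nn.Linear(2 * obs_dim, n_actions)

    def forward(self, obs, acts, next_obs, done=None):
        heads = self.trunk(th.cat([obs, next_obs], dim=1))  # (batch, n_actions)
        # Integer indexing selects a head; the index itself is not differentiable.
        return heads[th.arange(len(acts)), acts]


net = ToyMultiHeadRewardNet()
obs = th.randn(15, 8, requires_grad=True)
next_obs = th.randn(15, 8, requires_grad=True)
acts = th.arange(15)  # one action index per feature, as in the diff above
rews = net(obs, acts, next_obs, done=None)
rews.sum().backward()
print(obs.grad is not None)  # True: gradients reach the observations
print(acts.grad)             # None: the action only routes, it receives no gradient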

src/reward_preprocessing/vis/objectives.py

Lines changed: 15 additions & 0 deletions
@@ -1,9 +1,24 @@
 """Objectives that extend the objectives available in lucent.optvis.objectives"""
+from typing import Optional
+
 from lucent.optvis.objectives import handle_batch, wrap_objective
 from lucent.optvis.objectives_util import _extract_act_pos
 import torch as th


+@wrap_objective()
+def max_index_1d(layer: str, i: int, batch: Optional[int] = None):
+    """Maximize the value at a specific index in a 1D tensor."""
+
+    @handle_batch(batch)
+    def inner(model):
+        layer_t = model(layer)
+        # This is (batch_size, n), we want to maximize the ith element of each batch.
+        return -layer_t[:, i].mean()
+
+    return inner
+
+
 @wrap_objective()
 def direction_neuron_dim_agnostic(layer, direction, x=None, y=None, batch=None):
     """The lucent direction neuron objective, modified to allow 2-dimensional

src/reward_preprocessing/vis/reward_vis.py

Lines changed: 16 additions & 3 deletions
@@ -263,10 +263,23 @@ def vis_traditional(
             feature_list = [feature_list]

         obj = sum(
+            # Original with cosine similarity (for if we go back to interpreting neuron
+            # directions in intermediate layers):
+            # [
+            #     objectives_rfi.direction_neuron_dim_agnostic(
+            #         self.layer_name, self.channel_dirs[feature], batch=feature
+            #     )
+            #     for feature in feature_list
+            # ]
+            # New:
+            # Sum up all objectives such that we simultaneously optimize for all.
+            # Each objective maximizes the output for one of the activations (in this
+            # case equivalent to the reward for the respective actions, or overall
+            # reward if we don't differentiate between actions) and depends only on the
+            # input at that same index.
+            # In other words, each input maximizes its respective activation.
             [
-                objectives_rfi.direction_neuron_dim_agnostic(
-                    self.layer_name, self.channel_dirs[feature], batch=feature
-                )
+                objectives_rfi.max_index_1d(self.layer_name, feature, batch=feature)
                 for feature in feature_list
             ]
         )
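
To make the new comment concrete: because objective i only reads output index i of batch element i, the summed objective decouples into one independent maximization per input. A small self-contained sketch with a hypothetical linear "final layer" standing in for the reward heads:

import torch as th

th.manual_seed(0)
n_features = 15
final_layer = th.nn.Linear(32, n_features)        # stand-in for the 15 reward heads
inputs = th.randn(n_features, 32, requires_grad=True)
optimizer = th.optim.Adam([inputs], lr=0.1)

for _ in range(200):
    optimizer.zero_grad()
    out = final_layer(inputs)                     # (n_features, n_features)
    # Same idea as sum(max_index_1d(layer, i, batch=i) for i in range(n_features)):
    # batch element i is only pushed to increase output index i.
    loss = -out[th.arange(n_features), th.arange(n_features)].sum()
    loss.backward()
    optimizer.step()

out = final_layer(inputs)
# Each input should now maximize its own activation: row i's largest output is index i.
print((out.argmax(dim=1) == th.arange(n_features)).all())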
