@@ -174,16 +174,18 @@ def interpret(
         # Imitation reward nets have 4 input args, lucent expects models to only have 1.
         # This wrapper makes it so rew_net accepts a single input which is a
         # transition tensor.
-        rew_net = TensorTransitionWrapper(rew_net)
+        model_to_analyse = TensorTransitionWrapper(rew_net)
     else:  # Use GAN
         # Combine rew net with GAN.
         gan = th.load(gan_path, map_location=th.device(device))
-        rew_net = RewardGeneratorCombo(reward_net=rew_net, generator=gan.generator)
+        model_to_analyse = RewardGeneratorCombo(
+            reward_net=rew_net, generator=gan.generator
+        )
 
-    rew_net.eval()  # Eval for visualization.
+    model_to_analyse.eval()  # Eval for visualization.
 
     custom_logger.log("Available layers:")
-    custom_logger.log(get_model_layers(rew_net))
+    custom_logger.log(get_model_layers(model_to_analyse))
 
     # Load the inputs into the model that are used to do dimensionality reduction and
     # getting the shape of activations.
@@ -217,7 +219,6 @@ def interpret(
     # In our case this is one of the following:
     # - A reward net that has been wrapped, so it accepts transition tensors.
     # - A combo of GAN and reward net that accepts latent inputs vectors.
-    model_to_analyse = rew_net
     nmf = LayerNMF(
         model=model_to_analyse,
         features=num_features,
@@ -228,8 +229,8 @@ def interpret(
         activation_fn="sigmoid",
     )
 
-    custom_logger.log(f"Dimensionality reduction (to, from): {nmf.channel_dirs.shape}")
     # If these are equal, then of course there is no actual reduction.
+    custom_logger.log(f"Dimensionality reduction (to, from): {nmf.channel_dirs.shape}")
 
     num_features = nmf.channel_dirs.shape[0]
     rows, columns = 2, num_features
@@ -246,14 +247,16 @@ def interpret(
         # This does the actual interpretability, i.e. it calculates the
         # visualizations.
         opt_transitions = nmf.vis_traditional(transforms=transforms)
-        # This gives as an array that optimizes the objectives, in the shape of the
+        # This gives us an array that optimizes the objectives, in the shape of the
         # input which is a transition tensor. However, lucent helpfully transposes
         # the output such that the channel dimension is last. Our functions expect
         # channel dim before spatial dims, so we need to transpose it back.
         opt_transitions = opt_transitions.transpose(0, 3, 1, 2)
+        # In the following we need opt_transitions to be a pytorch tensor.
+        opt_transitions = th.tensor(opt_transitions)
         # Split the optimized transitions, one for each feature, into separate
         # observations and actions. This function only works with torch tensors.
-        obs, acts, next_obs = tensor_to_transition(th.tensor(opt_transitions))
+        obs, acts, next_obs = tensor_to_transition(opt_transitions)
         # obs and next_obs output have channel dim last.
         # acts is output as one-hot vector.
     else:
@@ -272,15 +275,40 @@ def interpret(
         opt_transitions = gan.generator(opt_latent_th)
         obs, acts, next_obs = tensor_to_transition(opt_transitions)
 
+    # What reward does the model output for these generated transitions?
+    # (done isn't used in the reward function)
+    # There are three possible options here:
+    # - The reward net does not use action -> it does not matter what we pass as
+    #   action.
+    # - The reward net does use action, and we are optimizing an intermediate layer
+    #   -> since action is only used on the final layer (to choose which of the 15
+    #   heads has the correct reward), it does not matter what we pass as action.
+    # - The reward net does use action, and we are optimizing the final layer
+    #   -> the action index of the action corresponds to the index of the feature.
+    # Note that since actions is only used to choose which head to use, there are no
+    # gradients from the reward to the action. Consequently, acts in opt_latent is
+    # meaningless.
+    actions = th.tensor(list(range(num_features))).to(device)
+    assert len(actions) == len(obs)
+    rews = rew_net(obs.to(device), actions, next_obs.to(device), done=None)
+    custom_logger.log(f"Rewards: {rews}")
+
     # Use numpy from here.
     obs = obs.detach().cpu().numpy()
     next_obs = next_obs.detach().cpu().numpy()
+    rews = rews.detach().cpu().numpy()
 
     # We want to plot the name of the action, if applicable.
     features_are_actions = _determine_features_are_actions(nmf, layer_name)
 
     # Set of images, one for each feature, add each to plot
     for feature_i in range(next_obs.shape[0]):
+        # Log the rewards
+        rew_key = f"rew_feat_{feature_i:02}"
+        if features_are_actions:
+            rew_key += f"_{_get_action_meaning(action_id=feature_i)}"
+        custom_logger.record(rew_key, rews[feature_i])
+        # Log the images
         sub_img_obs = obs[feature_i]
         sub_img_next_obs = next_obs[feature_i]
         _log_single_transition_wandb(
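
As a side note for readers outside the repo: the core pattern this change relies on is wrapping a multi-argument reward net behind a single-tensor interface, so that lucent-style tools (which assume model(input)) can drive it. Below is a minimal sketch of that pattern using made-up stand-ins (ToyRewardNet, SingleInputWrapper, and the chosen tensor layout are assumptions for illustration); it is not the repository's actual TensorTransitionWrapper.

import torch as th
import torch.nn as nn


class ToyRewardNet(nn.Module):
    """Toy stand-in for a reward net taking (obs, action, next_obs, done)."""

    def __init__(self, obs_dim: int, n_actions: int):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2 * obs_dim + n_actions, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )

    def forward(self, obs, action, next_obs, done=None):
        # done is ignored, matching the comment in the diff above.
        return self.mlp(th.cat([obs, action, next_obs], dim=-1)).squeeze(-1)


class SingleInputWrapper(nn.Module):
    """Expose the reward net as model(transition_tensor) for visualization tools."""

    def __init__(self, reward_net: nn.Module, obs_dim: int, n_actions: int):
        super().__init__()
        self.reward_net = reward_net
        self.obs_dim = obs_dim
        self.n_actions = n_actions

    def forward(self, transition: th.Tensor) -> th.Tensor:
        # Split the flat transition tensor back into (obs, action, next_obs).
        obs = transition[..., : self.obs_dim]
        action = transition[..., self.obs_dim : self.obs_dim + self.n_actions]
        next_obs = transition[..., self.obs_dim + self.n_actions :]
        return self.reward_net(obs, action, next_obs, done=None)


if __name__ == "__main__":
    obs_dim, n_actions = 8, 15
    rew_net = ToyRewardNet(obs_dim, n_actions)
    model_to_analyse = SingleInputWrapper(rew_net, obs_dim, n_actions)
    model_to_analyse.eval()  # Eval for visualization, as in the diff.
    dummy = th.randn(4, 2 * obs_dim + n_actions)
    print(model_to_analyse(dummy).shape)  # -> torch.Size([4])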