Skip to content

Commit 554d9ef

Browse files
committed
Fix interpret objective
1 parent 84788a1 commit 554d9ef

File tree

3 files changed

+30
-5
lines changed

3 files changed

+30
-5
lines changed

src/reward_preprocessing/interpret.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,9 @@ def interpret(
171171
else: # Use GAN
172172
# Combine rew net with GAN.
173173
gan = th.load(gan_path, map_location=th.device(device))
174-
model_to_analyse = RewardGeneratorCombo(reward_net=rew_net, generator=gan.generator)
174+
model_to_analyse = RewardGeneratorCombo(
175+
reward_net=rew_net, generator=gan.generator
176+
)
175177

176178
model_to_analyse.eval() # Eval for visualization.
177179

@@ -220,8 +222,8 @@ def interpret(
220222
activation_fn="sigmoid",
221223
)
222224

223-
custom_logger.log(f"Dimensionality reduction (to, from): {nmf.channel_dirs.shape}")
224225
# If these are equal, then of course there is no actual reduction.
226+
custom_logger.log(f"Dimensionality reduction (to, from): {nmf.channel_dirs.shape}")
225227

226228
num_features = nmf.channel_dirs.shape[0]
227229
rows, columns = 2, num_features
@@ -282,6 +284,7 @@ def interpret(
282284
actions = th.tensor(list(range(num_features))).to(device)
283285
assert len(actions) == len(obs)
284286
rews = rew_net(obs.to(device), actions, next_obs.to(device), done=None)
287+
custom_logger.log(f"Rewards: {rews}")
285288

286289
# Use numpy from here.
287290
obs = obs.detach().cpu().numpy()

src/reward_preprocessing/vis/objectives.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,24 @@
11
"""Objectives that extend the objectives available in lucent.optvis.objectives"""
2+
from typing import Optional
3+
24
from lucent.optvis.objectives import handle_batch, wrap_objective
35
from lucent.optvis.objectives_util import _extract_act_pos
46
import torch as th
57

68

9+
@wrap_objective()
def max_index_1d(layer: str, i: int, batch: Optional[int] = None):
    """Objective that maximizes the value at index ``i`` of a 1D layer output.

    Args:
        layer: Name of the layer whose output is optimized.
        i: Index of the element (within each batch item) to maximize.
        batch: Optional batch index, forwarded to ``handle_batch``.

    Returns:
        A lucent-style objective callable suitable for ``render_vis``.
    """

    @handle_batch(batch)
    def inner(model):
        # Layer output is (batch_size, n); lucent minimizes the returned
        # value, so negate the mean of the i-th element to maximize it.
        activations = model(layer)
        return -activations[:, i].mean()

    return inner
20+
21+
722
@wrap_objective()
823
def direction_neuron_dim_agnostic(layer, direction, x=None, y=None, batch=None):
924
"""The lucent direction neuron objective, modified to allow 2-dimensional

src/reward_preprocessing/vis/reward_vis.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
from typing import Callable, Dict, List, Optional, Union
55

6+
from lucent.optvis import objectives
67
from lucent.optvis.objectives import handle_batch, wrap_objective
78
import lucent.optvis.param as param
89
import lucent.optvis.render as render
@@ -263,10 +264,16 @@ def vis_traditional(
263264
feature_list = [feature_list]
264265

265266
obj = sum(
267+
# Original with cosine similarity:
268+
# [
269+
# objectives_rfi.direction_neuron_dim_agnostic(
270+
# self.layer_name, self.channel_dirs[feature], batch=feature
271+
# )
272+
# for feature in feature_list
273+
# ]
274+
# New:
266275
[
267-
objectives_rfi.direction_neuron_dim_agnostic(
268-
self.layer_name, self.channel_dirs[feature], batch=feature
269-
)
276+
objectives_rfi.max_index_1d(self.layer_name, feature, batch=feature)
270277
for feature in feature_list
271278
]
272279
)

0 commit comments

Comments
 (0)