
Commit f6dbee1

Merge pull request #191 from eggachecat/main

feat: add intervenable_model to forward's function signature

2 parents: 64ad99f + a6b0228

File tree: 5 files changed (+68 −20 lines)

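What the change does: extra keyword arguments passed to IntervenableModel.forward (and generate) are now threaded through do_intervention down to every intervention's forward, so an intervention can receive runtime context such as the wrapping IntervenableModel itself. Below is a minimal sketch of the pattern, modeled on the Llama test added in this commit; the checkpoint name, the prompt, and the LogitLensIntervention class are illustrative assumptions, not part of the diff.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pyvene import IntervenableModel

class LogitLensIntervention:
    # duck-typed intervention, mirroring AccessIntervenableModelIntervention
    # from the new test below
    is_source_constant = True
    keep_last_dim = True
    intervention_types = "logit_lens_intervention"

    def __init__(self, collected, layer_index, **kwargs):
        self.collected = collected
        self.layer_index = layer_index

    def __call__(self, base, source=None, subspaces=None, **kwargs):
        # `intervenable_model` shows up here only because forward() now
        # forwards arbitrary **kwargs to each intervention call
        iv_model = kwargs.get("intervenable_model", None)
        if iv_model is not None:
            # decode the intermediate residual stream with the model's own lm_head
            self.collected[self.layer_index] = iv_model.model.lm_head(base)
        return base  # pass the activation through unchanged

model_name = "meta-llama/Llama-2-7b-hf"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
llama = AutoModelForCausalLM.from_pretrained(model_name)

collected = {}
pv_llama = IntervenableModel([
    {"intervention": LogitLensIntervention(collected, layer),
     "component": f"model.layers.{layer}.input"}
    for layer in [1, 3]
], model=llama)

outputs = pv_llama(
    base=tokenizer("The capital of Spain is", return_tensors="pt"),
    unit_locations={"base": 3},
    intervenable_model=pv_llama,  # any extra kwarg now reaches the interventions
)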

pyvene/models/intervenable_base.py

Lines changed: 22 additions & 1 deletion
@@ -804,6 +804,7 @@ def _intervention_setter(
         keys,
         unit_locations_base,
         subspaces,
+        **intervention_forward_kwargs
     ) -> HandlerList:
         """
         Create a list of setter tracer that will set activations
@@ -848,6 +849,7 @@ def _intervention_setter(
                             None,
                             intervention,
                             subspaces[key_i] if subspaces is not None else None,
+                            **intervention_forward_kwargs
                         )
                         # fail if this is not a fresh collect
                         assert key not in self.activations
@@ -862,6 +864,7 @@ def _intervention_setter(
                             None,
                             intervention,
                             subspaces[key_i] if subspaces is not None else None,
+                            **intervention_forward_kwargs
                         )
                     else:
                         intervened_representation = do_intervention(
@@ -873,6 +876,7 @@ def _intervention_setter(
                             ),
                             intervention,
                             subspaces[key_i] if subspaces is not None else None,
+                            **intervention_forward_kwargs
                         )
                 else:
                     # highly unlikely it's a primitive intervention type
@@ -885,6 +889,7 @@ def _intervention_setter(
                         ),
                         intervention,
                         subspaces[key_i] if subspaces is not None else None,
+                        **intervention_forward_kwargs
                     )
                 if intervened_representation is None:
                     return
@@ -970,6 +975,7 @@ def _sync_forward_with_parallel_intervention(
                         ]
                         if subspaces is not None
                         else None,
+                        **kwargs
                     )
             counterfactual_outputs = self.model.output.save()

@@ -997,6 +1003,7 @@ def forward(
         output_original_output: Optional[bool] = False,
         return_dict: Optional[bool] = None,
         use_cache: Optional[bool] = None,
+        **kwargs
     ):
         activations_sources = source_representations
         if sources is not None and not isinstance(sources, list):
@@ -1036,7 +1043,7 @@ def forward(
         try:

             # run intervened forward
-            model_kwargs = {}
+            model_kwargs = { **kwargs }
             if labels is not None: # for training
                 model_kwargs["labels"] = labels
             if use_cache is not None and 'use_cache' in self.model.config.to_dict(): # for transformer models
@@ -1526,6 +1533,7 @@ def _intervention_setter(
         keys,
         unit_locations_base,
         subspaces,
+        **intervention_forward_kwargs
     ) -> HandlerList:
         """
         Create a list of setter handlers that will set activations
@@ -1573,6 +1581,7 @@ def hook_callback(model, args, kwargs, output=None):
                             None,
                             intervention,
                             subspaces[key_i] if subspaces is not None else None,
+                            **intervention_forward_kwargs
                         )
                         # fail if this is not a fresh collect
                         assert key not in self.activations
@@ -1588,6 +1597,7 @@ def hook_callback(model, args, kwargs, output=None):
                             None,
                             intervention,
                             subspaces[key_i] if subspaces is not None else None,
+                            **intervention_forward_kwargs
                         )
                         if isinstance(raw_intervened_representation, InterventionOutput):
                             self.full_intervention_outputs.append(raw_intervened_representation)
@@ -1604,6 +1614,7 @@ def hook_callback(model, args, kwargs, output=None):
                             ),
                             intervention,
                             subspaces[key_i] if subspaces is not None else None,
+                            **intervention_forward_kwargs
                         )
                 else:
                     # highly unlikely it's a primitive intervention type
@@ -1616,6 +1627,7 @@ def hook_callback(model, args, kwargs, output=None):
                         ),
                         intervention,
                         subspaces[key_i] if subspaces is not None else None,
+                        **intervention_forward_kwargs
                     )
                 if intervened_representation is None:
                     return
@@ -1683,6 +1695,7 @@ def _wait_for_forward_with_parallel_intervention(
         unit_locations,
         activations_sources: Optional[Dict] = None,
         subspaces: Optional[List] = None,
+        **intervention_forward_kwargs
     ):
         # torch.autograd.set_detect_anomaly(True)
         all_set_handlers = HandlerList([])
@@ -1738,6 +1751,7 @@ def _wait_for_forward_with_parallel_intervention(
                 ]
                 if subspaces is not None
                 else None,
+                **intervention_forward_kwargs
             )
             # for setters, we don't remove them.
             all_set_handlers.extend(set_handlers)
@@ -1749,6 +1763,7 @@ def _wait_for_forward_with_serial_intervention(
         unit_locations,
         activations_sources: Optional[Dict] = None,
         subspaces: Optional[List] = None,
+        **intervention_forward_kwargs
     ):
         all_set_handlers = HandlerList([])
         for group_id, keys in self._intervention_group.items():
@@ -1805,6 +1820,7 @@ def _wait_for_forward_with_serial_intervention(
                 ]
                 if subspaces is not None
                 else None,
+                **intervention_forward_kwargs
             )
             # for setters, we don't remove them.
             all_set_handlers.extend(set_handlers)
@@ -1821,6 +1837,7 @@ def forward(
         output_original_output: Optional[bool] = False,
         return_dict: Optional[bool] = None,
         use_cache: Optional[bool] = None,
+        **intervention_forward_kwargs
     ):
         """
         Main forward function that serves a wrapper to
@@ -1929,6 +1946,7 @@ def forward(
                     unit_locations,
                     activations_sources,
                     subspaces,
+                    **intervention_forward_kwargs
                 )
             )
         elif self.mode == "serial":
@@ -1938,6 +1956,7 @@ def forward(
                     unit_locations,
                     activations_sources,
                     subspaces,
+                    **intervention_forward_kwargs
                 )
             )

@@ -2071,6 +2090,7 @@ def generate(
                     unit_locations,
                     activations_sources,
                     subspaces,
+                    **kwargs
                 )
             )
         elif self.mode == "serial":
@@ -2080,6 +2100,7 @@ def generate(
                     unit_locations,
                     activations_sources,
                     subspaces,
+                    **kwargs
                 )
             )
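
Note the asymmetry between the two forward paths patched above: in the first forward (which, judging by the self.model.output.save() idiom, belongs to the tracer-based wrapper), the incoming **kwargs are also merged into model_kwargs via model_kwargs = { **kwargs }, so they reach the wrapped model's call in addition to the interventions; in the hook-based forward further down, **intervention_forward_kwargs flows only into the parallel/serial intervention setters.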

pyvene/models/interventions.py

Lines changed: 16 additions & 16 deletions
@@ -75,7 +75,7 @@ def set_interchange_dim(self, interchange_dim):
         self.interchange_dim = interchange_dim

     @abstractmethod
-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         pass


@@ -153,7 +153,7 @@ class ZeroIntervention(ConstantSourceIntervention, LocalistRepresentationInterven
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def forward(self, base, source=None, subspaces=None):
+    def forward(self, base, source=None, subspaces=None, **kwargs):
         return _do_intervention_by_swap(
             base,
             torch.zeros_like(base),
@@ -175,7 +175,7 @@ class CollectIntervention(ConstantSourceIntervention):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def forward(self, base, source=None, subspaces=None):
+    def forward(self, base, source=None, subspaces=None, **kwargs):
         return _do_intervention_by_swap(
             base,
             source,
@@ -197,7 +197,7 @@ class SkipIntervention(BasisAgnosticIntervention, LocalistRepresentationInterven
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         # source here is the base example input to the hook
         return _do_intervention_by_swap(
             base,
@@ -220,7 +220,7 @@ class VanillaIntervention(Intervention, LocalistRepresentationIntervention):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         return _do_intervention_by_swap(
             base,
             source if self.source_representation is None else self.source_representation,
@@ -242,7 +242,7 @@ class AdditionIntervention(BasisAgnosticIntervention, LocalistRepresentationInte
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         return _do_intervention_by_swap(
             base,
             source if self.source_representation is None else self.source_representation,
@@ -264,7 +264,7 @@ class SubtractionIntervention(BasisAgnosticIntervention, LocalistRepresentationI
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):

         return _do_intervention_by_swap(
             base,
@@ -289,7 +289,7 @@ def __init__(self, **kwargs):
         rotate_layer = RotateLayer(self.embed_dim)
         self.rotate_layer = torch.nn.utils.parametrizations.orthogonal(rotate_layer)

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         rotated_base = self.rotate_layer(base)
         rotated_source = self.rotate_layer(source)
         # interchange
@@ -340,7 +340,7 @@ def set_intervention_boundaries(self, intervention_boundaries):
             torch.tensor([intervention_boundaries]), requires_grad=True
         )

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         batch_size = base.shape[0]
         rotated_base = self.rotate_layer(base)
         rotated_source = self.rotate_layer(source)
@@ -391,7 +391,7 @@ def get_temperature(self):
     def set_temperature(self, temp: torch.Tensor):
         self.temperature.data = temp

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         batch_size = base.shape[0]
         rotated_base = self.rotate_layer(base)
         rotated_source = self.rotate_layer(source)
@@ -431,7 +431,7 @@ def get_temperature(self):
     def set_temperature(self, temp: torch.Tensor):
         self.temperature.data = temp

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         batch_size = base.shape[0]
         # get boundary mask between 0 and 1 from sigmoid
         mask_sigmoid = torch.sigmoid(self.mask / torch.tensor(self.temperature))
@@ -456,7 +456,7 @@ def __init__(self, **kwargs):
         rotate_layer = LowRankRotateLayer(self.embed_dim, kwargs["low_rank_dimension"])
         self.rotate_layer = torch.nn.utils.parametrizations.orthogonal(rotate_layer)

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         rotated_base = self.rotate_layer(base)
         rotated_source = self.rotate_layer(source)
         if subspaces is not None:
@@ -529,7 +529,7 @@ def __init__(self, **kwargs):
         )
         self.trainable = False

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         base_norm = (base - self.pca_mean) / self.pca_std
         source_norm = (source - self.pca_mean) / self.pca_std

@@ -565,7 +565,7 @@ def __init__(self, **kwargs):
             prng(1, 4, self.embed_dim)))
         self.register_buffer('noise_level', torch.tensor(noise_level))

-    def forward(self, base, source=None, subspaces=None):
+    def forward(self, base, source=None, subspaces=None, **kwargs):
         base[..., : self.interchange_dim] += self.noise * self.noise_level
         return base

@@ -585,7 +585,7 @@ def __init__(self, **kwargs):
         self.autoencoder = AutoencoderLayer(
             self.embed_dim, kwargs["latent_dim"])

-    def forward(self, base, source, subspaces=None):
+    def forward(self, base, source, subspaces=None, **kwargs):
         base_dtype = base.dtype
         base = base.to(self.autoencoder.encoder[0].weight.dtype)
         base_latent = self.autoencoder.encode(base)
@@ -619,7 +619,7 @@ def encode(self, input_acts):
     def decode(self, acts):
         return acts @ self.W_dec + self.b_dec

-    def forward(self, base, source=None, subspaces=None):
+    def forward(self, base, source=None, subspaces=None, **kwargs):
         # generate latents for base and source runs.
         base_latent = self.encode(base)
         source_latent = self.encode(source)
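
Every built-in intervention's forward now tolerates extra keyword arguments, and the abstract base signature requires them, so custom interventions that override forward should do the same to stay compatible. A minimal sketch of such a subclass, following the MultiplierIntervention pattern from the pyvene tutorials (the class name and the scale kwarg are hypothetical):

import pyvene as pv

class ScaleIntervention(pv.ConstantSourceIntervention, pv.LocalistRepresentationIntervention):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, base, source=None, subspaces=None, **kwargs):
        # kwargs carries anything extra passed to IntervenableModel.forward,
        # e.g. pv_model(base=..., scale=0.5) -> kwargs["scale"] == 0.5
        return base * kwargs.get("scale", 1.0)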

pyvene/models/modeling_utils.py

Lines changed: 3 additions & 2 deletions
@@ -446,7 +446,7 @@ def scatter_neurons(


 def do_intervention(
-    base_representation, source_representation, intervention, subspaces
+    base_representation, source_representation, intervention, subspaces, **intervention_forward_kwargs
 ):
     """Do the actual intervention."""

@@ -478,7 +478,8 @@ def do_intervention(
         assert False # what's going on?

     intervention_output = intervention(
-        base_representation_f, source_representation_f, subspaces
+        base_representation_f, source_representation_f, subspaces,
+        **intervention_forward_kwargs
     )
     if isinstance(intervention_output, InterventionOutput):
         intervened_representation = intervention_output.output
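
This is the choke point of the feature: the setters in intervenable_base.py invoke module-based interventions through do_intervention, so widening its signature here is what actually delivers the extra keyword arguments to each intervention's forward.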

tests/integration_tests/IntervenableBasicTestCase.py

Lines changed: 1 addition & 1 deletion
@@ -232,7 +232,7 @@ class MultiplierIntervention(
     def __init__(self, embed_dim, **kwargs):
         super().__init__()
     def forward(
-        self, base, source=None, subspaces=None):
+        self, base, source=None, subspaces=None, **kwargs):
         return base * 99.0
 # run with new intervention type
 pv_gpt2 = pv.IntervenableModel({

tests/integration_tests/InterventionWithLlamaTestCase.py

Lines changed: 26 additions & 0 deletions
@@ -156,6 +156,32 @@ def test_with_multiple_heads_positions_vanilla_intervention_positive(self):
             heads=[4, 1],
             positions=[7, 2],
         )
+
+    def test_with_llm_head(self):
+        that = self
+        _lm_head_collection = {}
+        class AccessIntervenableModelIntervention:
+            is_source_constant = True
+            keep_last_dim = True
+            intervention_types = 'access_intervenable_model_intervention'
+            def __init__(self, layer_index, *args, **kwargs):
+                super().__init__()
+                self.layer_index = layer_index
+            def __call__(self, base, source=None, subspaces=None, model=None, **kwargs):
+                intervenable_model = kwargs.get('intervenable_model', None)
+                assert intervenable_model is not None
+                _lm_head_collection[self.layer_index] = intervenable_model.model.lm_head(base.to(that.device))
+                return base
+        # run with new intervention type
+        pv_llama = IntervenableModel([{
+            "intervention": AccessIntervenableModelIntervention(layer_index=layer),
+            "component": f"model.layers.{layer}.input"
+        } for layer in [1, 3]], model=self.llama)
+        intervened_outputs = pv_llama(
+            base=self.tokenizer("The capital of Spain is", return_tensors="pt").to(that.device),
+            unit_locations={"base": 3},
+            intervenable_model=pv_llama
+        )


 def suite():
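
The new test exercises the plumbing end to end: the model is called with intervenable_model=pv_llama, the intervention asserts that the object actually arrives through **kwargs, and it then uses it to run lm_head on intermediate layer inputs, a logit-lens-style readout that was not possible before this change.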
