stanfordnlp · aryamanarora · Jan 29, 2026 · Jan 28, 2026 · Jan 28, 2026 · chatgpt-codex-connector
diff --git a/pyvene/models/intervenable_modelcard.py b/pyvene/models/intervenable_modelcard.py
@@ -19,6 +19,8 @@
 from .esm.modelings_intervenable_esm import *
 from .mllama.modelings_intervenable_mllama import *
 from .gpt_oss.modelings_intervenable_gpt_oss import *
+from .whisper.modelings_intervenable_whisper import *
+from .wav2vec2bert.modelings_intervenable_wav2vec2bert import *
 
 #########################################################################
 """
@@ -89,6 +91,9 @@
     hf_models.mllama.modeling_mllama.MllamaForConditionalGeneration: mllama_type_to_module_mapping,
     hf_models.gpt_oss.modeling_gpt_oss.GptOssModel: gpt_oss_type_to_module_mapping,
     hf_models.gpt_oss.modeling_gpt_oss.GptOssForCausalLM: gpt_oss_lm_type_to_module_mapping,
+    hf_models.whisper.modeling_whisper.WhisperModel: whisper_type_to_module_mapping,
+    hf_models.whisper.modeling_whisper.WhisperForConditionalGeneration: whisper_lm_type_to_module_mapping,
+    hf_models.wav2vec2_bert.modeling_wav2vec2_bert.Wav2Vec2BertModel: wav2vec2bert_type_to_module_mapping,
 }
 if enable_blip:
     type_to_module_mapping[BlipWrapper] = blip_wrapper_type_to_module_mapping
@@ -135,6 +140,9 @@
     hf_models.mllama.modeling_mllama.MllamaForConditionalGeneration: mllama_type_to_dimension_mapping,
     hf_models.gpt_oss.modeling_gpt_oss.GptOssModel: gpt_oss_type_to_dimension_mapping,
     hf_models.gpt_oss.modeling_gpt_oss.GptOssForCausalLM: gpt_oss_lm_type_to_dimension_mapping,
+    hf_models.whisper.modeling_whisper.WhisperModel: whisper_type_to_dimension_mapping,
+    hf_models.whisper.modeling_whisper.WhisperForConditionalGeneration: whisper_lm_type_to_dimension_mapping,
+    hf_models.wav2vec2_bert.modeling_wav2vec2_bert.Wav2Vec2BertModel: wav2vec2bert_type_to_dimension_mapping,
 }
 
 if enable_blip:

diff --git a/pyvene/models/modeling_utils.py b/pyvene/models/modeling_utils.py
@@ -465,7 +465,10 @@ def do_intervention(
     # flatten
     original_base_shape = base_representation.shape
     if len(original_base_shape) == 2 or (
-        isinstance(intervention, LocalistRepresentationIntervention)
+        isinstance(intervention, LocalistRepresentationIntervention) or
+        isinstance(intervention, BoundlessRotatedSpaceIntervention) or
+        isinstance(intervention, VanillaIntervention) or
+        isinstance(intervention, CollectIntervention)
     ) or intervention.keep_last_dim:
         # no pos dimension, e.g., gru, or opt-out concate last two dims
         base_representation_f = base_representation
@@ -492,8 +495,11 @@ def do_intervention(
     post_d = intervened_representation.shape[-1]
 
     # unflatten
-    if len(original_base_shape) == 2 or isinstance(
-        intervention, LocalistRepresentationIntervention
+    if len(original_base_shape) == 2 or (
+        isinstance(intervention, LocalistRepresentationIntervention) or
+        isinstance(intervention, BoundlessRotatedSpaceIntervention) or
+        isinstance(intervention, VanillaIntervention) or
+        isinstance(intervention, CollectIntervention)
     ) or intervention.keep_last_dim:
         # no pos dimension, e.g., gru or opt-out concate last two dims
         pass

diff --git a/pyvene/models/wav2vec2bert/__init__.py b/pyvene/models/wav2vec2bert/__init__.py
diff --git a/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py b/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py
@@ -0,0 +1,61 @@
+"""
+Each modeling file in this library is a mapping between
+abstract names of intervention anchor points and the actual
+model modules defined in the huggingface library.
+We also want to let the intervention library know how to
+configure the dimensions of interventions based on the model
+config defined in the huggingface library.
+"""
+import torch
+from ..constants import *
+
+wav2vec2bert_type_to_module_mapping = {  # anchor name -> (module path template, hook constant[, (split fn, dimension key)])
+    "block_input": ("encoder.layers[%s]", CONST_INPUT_HOOK),  # "%s" is presumably filled with the layer index by the caller -- TODO confirm
+    "block_output": ("encoder.layers[%s]", CONST_OUTPUT_HOOK),
+    "ffn1_activation": ("encoder.layers[%s].ffn1.intermediate_act_fn", CONST_OUTPUT_HOOK),
+    "ffn1_output": ("encoder.layers[%s].ffn1", CONST_OUTPUT_HOOK),
+    "ffn1_input": ("encoder.layers[%s].ffn1", CONST_INPUT_HOOK),
+    "ffn2_activation": ("encoder.layers[%s].ffn2.intermediate_act_fn", CONST_OUTPUT_HOOK),
+    "ffn2_output": ("encoder.layers[%s].ffn2", CONST_OUTPUT_HOOK),
+    "ffn2_input": ("encoder.layers[%s].ffn2", CONST_INPUT_HOOK),
+    "attention_value_output": ("encoder.layers[%s].self_attn.linear_out", CONST_INPUT_HOOK),  # hooked as the input of linear_out
+    "head_attention_value_output": ("encoder.layers[%s].self_attn.linear_out", CONST_INPUT_HOOK, (split_head_and_permute, "n_head")),  # "n_head" is also a key of wav2vec2bert_type_to_dimension_mapping
+    "attention_output": ("encoder.layers[%s].self_attn", CONST_OUTPUT_HOOK),
+    "attention_input": ("encoder.layers[%s].self_attn", CONST_INPUT_HOOK),
+    "query_output": ("encoder.layers[%s].self_attn.linear_q", CONST_OUTPUT_HOOK),
+    "key_output": ("encoder.layers[%s].self_attn.linear_k", CONST_OUTPUT_HOOK),
+    "value_output": ("encoder.layers[%s].self_attn.linear_v", CONST_OUTPUT_HOOK),
+    "head_query_output": ("encoder.layers[%s].self_attn.linear_q", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")),
+    "head_key_output": ("encoder.layers[%s].self_attn.linear_k", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")),
+    "head_value_output": ("encoder.layers[%s].self_attn.linear_v", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")),
+    "conv_output": ("encoder.layers[%s].conv_module", CONST_OUTPUT_HOOK),
+    "conv_input": ("encoder.layers[%s].conv_module", CONST_INPUT_HOOK),
+    "conv_glu_output": ("encoder.layers[%s].conv_module.glu", CONST_OUTPUT_HOOK),  # NOTE(review): confirm hidden_size is the last axis of this submodule's output
+    "conv_depth_output": ("encoder.layers[%s].conv_module.depthwise_conv", CONST_OUTPUT_HOOK),  # NOTE(review): same axis-layout question as conv_glu_output
+}
+
+wav2vec2bert_type_to_dimension_mapping = {  # anchor name -> 1-tuple naming the model-config field(s) that give its size
+    "n_head": ("num_attention_heads",),  # key referenced by the head_* entries of wav2vec2bert_type_to_module_mapping
+    "block_input": ("hidden_size",),
+    "block_output": ("hidden_size",),
+    "ffn1_activation": ("intermediate_size",),
+    "ffn1_output": ("hidden_size",),
+    "ffn1_input": ("hidden_size",),
+    "ffn2_activation": ("intermediate_size",),
+    "ffn2_output": ("hidden_size",),
+    "ffn2_input": ("hidden_size",),
+    "attention_value_output": ("hidden_size",),
+    "head_attention_value_output": ("hidden_size/num_attention_heads",),  # "a/b" string; presumably resolved as a ratio of two config fields -- TODO confirm
+    "attention_output": ("hidden_size",),
+    "attention_input": ("hidden_size",),
+    "query_output": ("hidden_size",),
+    "key_output": ("hidden_size",),
+    "value_output": ("hidden_size",),
+    "head_query_output": ("hidden_size/num_attention_heads",),
+    "head_key_output": ("hidden_size/num_attention_heads",),
+    "head_value_output": ("hidden_size/num_attention_heads",),
+    "conv_output": ("hidden_size",),
+    "conv_input": ("hidden_size",),
+    "conv_glu_output": ("hidden_size",),  # NOTE(review): confirm against the actual output width of conv_module.glu
+    "conv_depth_output": ("hidden_size",),  # NOTE(review): confirm against the actual output width of conv_module.depthwise_conv
+}
diff --git a/pyvene/models/whisper/__init__.py b/pyvene/models/whisper/__init__.py
diff --git a/pyvene/models/whisper/modelings_intervenable_whisper.py b/pyvene/models/whisper/modelings_intervenable_whisper.py
@@ -0,0 +1,54 @@
+"""
+Each modeling file in this library is a mapping between
+abstract names of intervention anchor points and the actual
+model modules defined in the huggingface library.
+We also want to let the intervention library know how to
+configure the dimensions of interventions based on the model
+config defined in the huggingface library.
+"""
+import torch
+from ..constants import *
+
+whisper_type_to_module_mapping = {  # anchor name -> (module path template, hook constant[, (split fn, dimension key)]); every path here is under "encoder."
+    "block_input": ("encoder.layers[%s]", CONST_INPUT_HOOK),  # "%s" is presumably filled with the layer index by the caller -- TODO confirm
+    "block_output": ("encoder.layers[%s]", CONST_OUTPUT_HOOK),
+    "mlp_activation": ("encoder.layers[%s].activation_fn", CONST_OUTPUT_HOOK),
+    "mlp_output": ("encoder.layers[%s].fc2", CONST_OUTPUT_HOOK),  # output side is taken from fc2
+    "mlp_input": ("encoder.layers[%s].fc1", CONST_INPUT_HOOK),  # input side is taken from fc1
+    "attention_value_output": ("encoder.layers[%s].self_attn.out_proj", CONST_INPUT_HOOK),  # hooked as the input of out_proj
+    "head_attention_value_output": ("encoder.layers[%s].self_attn.out_proj", CONST_INPUT_HOOK, (split_head_and_permute, "n_head")),  # "n_head" is also a key of whisper_type_to_dimension_mapping
+    "attention_output": ("encoder.layers[%s].self_attn", CONST_OUTPUT_HOOK),
+    "attention_input": ("encoder.layers[%s].self_attn", CONST_INPUT_HOOK),
+    "query_output": ("encoder.layers[%s].self_attn.q_proj", CONST_OUTPUT_HOOK),
+    "key_output": ("encoder.layers[%s].self_attn.k_proj", CONST_OUTPUT_HOOK),
+    "value_output": ("encoder.layers[%s].self_attn.v_proj", CONST_OUTPUT_HOOK),
+    "head_query_output": ("encoder.layers[%s].self_attn.q_proj", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")),
+    "head_key_output": ("encoder.layers[%s].self_attn.k_proj", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")),
+    "head_value_output": ("encoder.layers[%s].self_attn.v_proj", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")),
+}
+
+whisper_type_to_dimension_mapping = {  # anchor name -> 1-tuple naming the model-config field(s) that give its size
+    "n_head": ("encoder_attention_heads",),  # key referenced by the head_* entries of whisper_type_to_module_mapping
+    "block_input": ("d_model",),
+    "block_output": ("d_model",),
+    "mlp_activation": ("encoder_ffn_dim",),
+    "mlp_output": ("d_model",),
+    "mlp_input": ("d_model",),
+    "attention_value_output": ("d_model",),
+    "head_attention_value_output": ("d_model/encoder_attention_heads",),  # "a/b" string; presumably resolved as a ratio of two config fields -- TODO confirm
+    "attention_output": ("d_model",),
+    "attention_input": ("d_model",),
+    "query_output": ("d_model",),
+    "key_output": ("d_model",),
+    "value_output": ("d_model",),
+    "head_query_output": ("d_model/encoder_attention_heads",),
+    "head_key_output": ("d_model/encoder_attention_heads",),
+    "head_value_output": ("d_model/encoder_attention_heads",),
+}
+
+"""whisper model with LM head"""
+whisper_lm_type_to_module_mapping = {  # same anchors as the base mapping, each path prefixed with "model."; a comprehension keeps k/v out of the module namespace (it is star-imported)
+    k: (f"model.{v[0]}",) + v[1:] for k, v in whisper_type_to_module_mapping.items()
+}
+whisper_lm_type_to_dimension_mapping = whisper_type_to_dimension_mapping  # alias of the same dict object, not a copy
+