From 7cac5c7726881b3b710910819087dd851cc222db Mon Sep 17 00:00:00 2001 From: Taka Yamakoshi Date: Wed, 28 Jan 2026 15:37:19 -0800 Subject: [PATCH 1/2] added speech models --- pyvene/models/intervenable_modelcard.py | 8 +++ pyvene/models/modeling_utils.py | 12 +++- pyvene/models/wav2vec2bert/__init__.py | 0 .../modelings_intervenable_wav2vec2bert.py | 61 +++++++++++++++++++ pyvene/models/whisper/__init__.py | 0 .../whisper/modelings_intervenable_whisper.py | 54 ++++++++++++++++ 6 files changed, 132 insertions(+), 3 deletions(-) create mode 100644 pyvene/models/wav2vec2bert/__init__.py create mode 100644 pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py create mode 100644 pyvene/models/whisper/__init__.py create mode 100644 pyvene/models/whisper/modelings_intervenable_whisper.py diff --git a/pyvene/models/intervenable_modelcard.py b/pyvene/models/intervenable_modelcard.py index 2bc08662..0f067f19 100644 --- a/pyvene/models/intervenable_modelcard.py +++ b/pyvene/models/intervenable_modelcard.py @@ -19,6 +19,8 @@ from .esm.modelings_intervenable_esm import * from .mllama.modelings_intervenable_mllama import * from .gpt_oss.modelings_intervenable_gpt_oss import * +from .whisper.modelings_intervenable_whisper import * +from .wav2vec2bert.modelings_intervenable_wav2vec2bert import * ######################################################################### """ @@ -89,6 +91,9 @@ hf_models.mllama.modeling_mllama.MllamaForConditionalGeneration: mllama_type_to_module_mapping, hf_models.gpt_oss.modeling_gpt_oss.GptOssModel: gpt_oss_type_to_module_mapping, hf_models.gpt_oss.modeling_gpt_oss.GptOssForCausalLM: gpt_oss_lm_type_to_module_mapping, + hf_models.whisper.modeling_whisper.WhisperModel: whisper_type_to_module_mapping, + hf_models.whisper.modeling_whisper.WhisperForConditionalGeneration: whisper_lm_type_to_module_mapping, + hf_models.wav2vec2_bert.modeling_wav2vec2_bert.Wav2Vec2BertModel: wav2vec2bert_type_to_module_mapping, } if enable_blip: type_to_module_mapping[BlipWrapper] = blip_wrapper_type_to_module_mapping @@ -135,6 +140,9 @@ hf_models.mllama.modeling_mllama.MllamaForConditionalGeneration: mllama_type_to_dimension_mapping, hf_models.gpt_oss.modeling_gpt_oss.GptOssModel: gpt_oss_type_to_dimension_mapping, hf_models.gpt_oss.modeling_gpt_oss.GptOssForCausalLM: gpt_oss_lm_type_to_dimension_mapping, + hf_models.whisper.modeling_whisper.WhisperModel: whisper_type_to_dimension_mapping, + hf_models.whisper.modeling_whisper.WhisperForConditionalGeneration: whisper_lm_type_to_dimension_mapping, + hf_models.wav2vec2_bert.modeling_wav2vec2_bert.Wav2Vec2BertModel: wav2vec2bert_type_to_dimension_mapping, } if enable_blip: diff --git a/pyvene/models/modeling_utils.py b/pyvene/models/modeling_utils.py index da74a0df..91f50f5f 100644 --- a/pyvene/models/modeling_utils.py +++ b/pyvene/models/modeling_utils.py @@ -465,7 +465,10 @@ def do_intervention( # flatten original_base_shape = base_representation.shape if len(original_base_shape) == 2 or ( - isinstance(intervention, LocalistRepresentationIntervention) + isinstance(intervention, LocalistRepresentationIntervention) or + isinstance(intervention, BoundlessRotatedSpaceIntervention) or + isinstance(intervention, VanillaIntervention) or + isinstance(intervention, CollectIntervention) ) or intervention.keep_last_dim: # no pos dimension, e.g., gru, or opt-out concate last two dims base_representation_f = base_representation @@ -492,8 +495,11 @@ def do_intervention( post_d = intervened_representation.shape[-1] # unflatten - if len(original_base_shape) == 2 or isinstance( - intervention, LocalistRepresentationIntervention + if len(original_base_shape) == 2 or ( + isinstance(intervention, LocalistRepresentationIntervention) or + isinstance(intervention, BoundlessRotatedSpaceIntervention) or + isinstance(intervention, VanillaIntervention) or + isinstance(intervention, CollectIntervention) ) or intervention.keep_last_dim: # no pos dimension, e.g., gru or opt-out concate last two dims pass diff --git a/pyvene/models/wav2vec2bert/__init__.py b/pyvene/models/wav2vec2bert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py b/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py new file mode 100644 index 00000000..fd970068 --- /dev/null +++ b/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py @@ -0,0 +1,61 @@ +""" +Each modeling file in this library is a mapping between +abstract naming of intervention anchor points and actual +model module defined in the huggingface library. +We also want to let the intervention library know how to +config the dimensions of intervention based on model config +defined in the huggingface library. +""" +import torch +from ..constants import * + +w2v2bert_type_to_module_mapping = { + "block_input": ("encoder.layers[%s]", CONST_INPUT_HOOK), + "block_output": ("encoder.layers[%s]", CONST_OUTPUT_HOOK), + "ffn1_activation": ("encoder.layers[%s].ffn1.intermediate_act_fn", CONST_OUTPUT_HOOK), + "ffn1_output": ("encoder.layers[%s].ffn1", CONST_OUTPUT_HOOK), + "ffn1_input": ("encoder.layers[%s].ffn1", CONST_INPUT_HOOK), + "ffn2_activation": ("encoder.layers[%s].ffn2.intermediate_act_fn", CONST_OUTPUT_HOOK), + "ffn2_output": ("encoder.layers[%s].ffn2", CONST_OUTPUT_HOOK), + "ffn2_input": ("encoder.layers[%s].ffn2", CONST_INPUT_HOOK), + "attention_value_output": ("encoder.layers[%s].self_attn.linear_out", CONST_INPUT_HOOK), + "head_attention_value_output": ("encoder.layers[%s].self_attn.linear_out", CONST_INPUT_HOOK, (split_head_and_permute, "n_head")), + "attention_output": ("encoder.layers[%s].self_attn", CONST_OUTPUT_HOOK), + "attention_input": ("encoder.layers[%s].self_attn", CONST_INPUT_HOOK), + "query_output": ("encoder.layers[%s].self_attn.linear_q", CONST_OUTPUT_HOOK), + "key_output": ("encoder.layers[%s].self_attn.linear_k", CONST_OUTPUT_HOOK), + "value_output": ("encoder.layers[%s].self_attn.linear_v", CONST_OUTPUT_HOOK), + "head_query_output": ("encoder.layers[%s].self_attn.linear_q", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")), + "head_key_output": ("encoder.layers[%s].self_attn.linear_k", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")), + "head_value_output": ("encoder.layers[%s].self_attn.linear_v", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")), + "conv_output": ("encoder.layers[%s].conv_module", CONST_OUTPUT_HOOK), + "conv_input": ("encoder.layers[%s].conv_module", CONST_INPUT_HOOK), + "conv_glu_output": ("encoder.layers[%s].conv_module.glu", CONST_OUTPUT_HOOK), + "conv_depth_output": ("encoder.layers[%s].conv_module.depthwise_conv", CONST_OUTPUT_HOOK), +} + +w2v2bert_type_to_dimension_mapping = { + "n_head": ("num_attention_heads",), + "block_input": ("hidden_size",), + "block_output": ("hidden_size",), + "ffn1_activation": ("intermediate_size",), + "ffn1_output": ("hidden_size",), + "ffn1_input": ("hidden_size",), + "ffn2_activation": ("intermediate_size",), + "ffn2_output": ("hidden_size",), + "ffn2_input": ("hidden_size",), + "attention_value_output": ("hidden_size",), + "head_attention_value_output": ("hidden_size/num_attention_heads",), + "attention_output": ("hidden_size",), + "attention_input": ("hidden_size",), + "query_output": ("hidden_size",), + "key_output": ("hidden_size",), + "value_output": ("hidden_size",), + "head_query_output": ("hidden_size/num_attention_heads",), + "head_key_output": ("hidden_size/num_attention_heads",), + "head_value_output": ("hidden_size/num_attention_heads",), + "conv_output": ("hidden_size",), + "conv_input": ("hidden_size",), + "conv_glu_output": ("hidden_size",), + "conv_depth_output": ("hidden_size",), +} diff --git a/pyvene/models/whisper/__init__.py b/pyvene/models/whisper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pyvene/models/whisper/modelings_intervenable_whisper.py b/pyvene/models/whisper/modelings_intervenable_whisper.py new file mode 100644 index 00000000..09b2da0d --- /dev/null +++ b/pyvene/models/whisper/modelings_intervenable_whisper.py @@ -0,0 +1,54 @@ +""" +Each modeling file in this library is a mapping between +abstract naming of intervention anchor points and actual +model module defined in the huggingface library. +We also want to let the intervention library know how to +config the dimensions of intervention based on model config +defined in the huggingface library. +""" +import torch +from ..constants import * + +whisper_type_to_module_mapping = { + "block_input": ("encoder.layers[%s]", CONST_INPUT_HOOK), + "block_output": ("encoder.layers[%s]", CONST_OUTPUT_HOOK), + "mlp_activation": ("encoder.layers[%s].activation_fn", CONST_OUTPUT_HOOK), + "mlp_output": ("encoder.layers[%s].fc2", CONST_OUTPUT_HOOK), + "mlp_input": ("encoder.layers[%s].fc1", CONST_INPUT_HOOK), + "attention_value_output": ("encoder.layers[%s].self_attn.out_proj", CONST_INPUT_HOOK), + "head_attention_value_output": ("encoder.layers[%s].self_attn.out_proj", CONST_INPUT_HOOK, (split_head_and_permute, "n_head")), + "attention_output": ("encoder.layers[%s].self_attn", CONST_OUTPUT_HOOK), + "attention_input": ("encoder.layers[%s].self_attn", CONST_INPUT_HOOK), + "query_output": ("encoder.layers[%s].self_attn.q_proj", CONST_OUTPUT_HOOK), + "key_output": ("encoder.layers[%s].self_attn.k_proj", CONST_OUTPUT_HOOK), + "value_output": ("encoder.layers[%s].self_attn.v_proj", CONST_OUTPUT_HOOK), + "head_query_output": ("encoder.layers[%s].self_attn.q_proj", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")), + "head_key_output": ("encoder.layers[%s].self_attn.k_proj", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")), + "head_value_output": ("encoder.layers[%s].self_attn.v_proj", CONST_OUTPUT_HOOK, (split_head_and_permute, "n_head")), +} + +whisper_type_to_dimension_mapping = { + "n_head": ("encoder_attention_heads",), + "block_input": ("d_model",), + "block_output": ("d_model",), + "mlp_activation": ("encoder_ffn_dim",), + "mlp_output": ("d_model",), + "mlp_input": ("d_model",), + "attention_value_output": ("d_model",), + "head_attention_value_output": ("d_model/encoder_attention_heads",), + "attention_output": ("d_model",), + "attention_input": ("d_model",), + "query_output": ("d_model",), + "key_output": ("d_model",), + "value_output": ("d_model",), + "head_query_output": ("d_model/encoder_attention_heads",), + "head_key_output": ("d_model/encoder_attention_heads",), + "head_value_output": ("d_model/encoder_attention_heads",), +} + +"""whisper model with LM head""" +whisper_lm_type_to_module_mapping = {} +for k, v in whisper_type_to_module_mapping.items(): + whisper_lm_type_to_module_mapping[k] = (f"model.{v[0]}", ) + v[1:] +whisper_lm_type_to_dimension_mapping = whisper_type_to_dimension_mapping + From 30616d78890e77d5f3663f819156afd8d0a3b24e Mon Sep 17 00:00:00 2001 From: Taka Yamakoshi Date: Wed, 28 Jan 2026 15:45:53 -0800 Subject: [PATCH 2/2] fix --- .../wav2vec2bert/modelings_intervenable_wav2vec2bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py b/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py index fd970068..8636d371 100644 --- a/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py +++ b/pyvene/models/wav2vec2bert/modelings_intervenable_wav2vec2bert.py @@ -9,7 +9,7 @@ import torch from ..constants import * -w2v2bert_type_to_module_mapping = { +wav2vec2bert_type_to_module_mapping = { "block_input": ("encoder.layers[%s]", CONST_INPUT_HOOK), "block_output": ("encoder.layers[%s]", CONST_OUTPUT_HOOK), "ffn1_activation": ("encoder.layers[%s].ffn1.intermediate_act_fn", CONST_OUTPUT_HOOK), @@ -34,7 +34,7 @@ "conv_depth_output": ("encoder.layers[%s].conv_module.depthwise_conv", CONST_OUTPUT_HOOK), } -w2v2bert_type_to_dimension_mapping = { +wav2vec2bert_type_to_dimension_mapping = { "n_head": ("num_attention_heads",), "block_input": ("hidden_size",), "block_output": ("hidden_size",),