-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy path sample_config.yaml
More file actions
102 lines (87 loc) · 4.12 KB
/
sample_config.yaml
File metadata and controls
102 lines (87 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Task selection: list of task-metric pairs, each a two-item flow list of strings
# ["task_name", "metric_name"] - you must only run one task with one metric (no repeat tasks)
task_metric:
  # Speech recognition datasets
  - ["alpaca_audio_test", "llm_judge_detailed"]
  - ["librispeech_test_clean", "word_error_rate"]
  # Emotion recognition runspec
  - ["emotion_recognition", "llm_judge_binary"]
  # Gender recognition datasets
  - ["voxceleb_gender_test", "llm_judge_binary"]
  - ["iemocap_gender_recognition", "llm_judge_binary"]
  # Question-answering datasets and runspecs
  - ["big_bench_audio", "llm_judge_big_bench_audio"]
  - ["music_understanding", "llm_judge_binary"]

# Optional: Aggregate multiple task-metric pairs into a single score.
# Each entry is a two-item list: ["metric_name", ["task1", "task2", ..., "taskN"]]
aggregate:
  - ["llm_judge_binary", ["emotion_recognition"]]
  - ["llm_judge_binary", ["voxceleb_gender_test", "iemocap_gender_recognition"]]

# Sample filtering applied before evaluation
filter:
  num_samples: 100  # number of samples to run (remove for all)
  length_filter: [0.0, 30.0]  # optional - keeps only audio samples within this length range (seconds) - only supported for general and callhome preprocessors

# LLM-as-judge configuration (used by the llm_judge_* metrics above)
judge_settings:
  judge_concurrency: 8  # concurrent judge calls (optional)
  judge_model: gpt-4o-mini  # optional
  judge_type: openai  # mandatory (vllm or openai)
  judge_api_version: ${API_VERSION}  # optional (needed for openai)
  judge_api_endpoint: ${ENDPOINT_URL}  # mandatory
  judge_api_key: ${AUTH_TOKEN}  # mandatory
  judge_temperature: 0.1  # optional
  judge_prompt_model_override: gpt-4o-mini-enhanced  # optional

logging:
  log_file: "audiobench.log"  # Path to the main log file

# Models under evaluation; each entry is one endpoint configuration
models:
  - name: gpt-4o-mini-audio-preview-1  # must be unique
    inference_type: openai  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: gpt-4o-mini-audio-preview
    auth_token: ${AUTH_TOKEN}
    api_version: ${API_VERSION}
    batch_size: 300  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model
  - name: gpt-4o-mini-audio-preview-2  # must be unique
    inference_type: openai  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: gpt-4o-mini-audio-preview
    auth_token: ${AUTH_TOKEN}
    api_version: ${API_VERSION}
    batch_size: 300  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model
  - name: gemini-2.5-flash  # must be unique
    inference_type: gemini  # you can use vllm, openai, gemini or transcription
    location: ${GOOGLE_CLOUD_LOCATION}  # GCP Vertex AI configuration
    project_id: ${GOOGLE_CLOUD_PROJECT}  # GCP Vertex AI configuration
    reasoning_effort: medium  # Optional - reasoning effort for supported reasoning models like gemini-2.5-flash, gpt-5, ...
    delay: 100
    retry_attempts: 5
    timeout: 300
    model: google/gemini-2.5-flash
    batch_size: 100  # Optional - batch eval size
    chunk_size: 30240  # Optional - max audio length in seconds fed to model (NOTE(review): 30240 s ~ 8.4 h, far larger than other models' 30-40 - confirm intended)
  - name: qwen-2.5-omni  # must be unique
    inference_type: vllm  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: qwen-2.5-omni
    auth_token: ${AUTH_TOKEN}
    batch_size: 200  # Optional - batch eval size
    chunk_size: 40  # Optional - max audio length in seconds fed to model
  - name: whisper-large-3  # must be unique
    inference_type: transcription  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: whisper-large-3
    auth_token: ${AUTH_TOKEN}
    batch_size: 100  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model

# Data sharding - If two models have same "model" attribute, we implement dataset sharding
# In command line you can also pass custom config file name to read from with bash evaluate.sh --config <config_file>