-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy path sample_config.yaml
More file actions
102 lines (87 loc) · 4.12 KB
/
sample_config.yaml
File metadata and controls
102 lines (87 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Task selection: list of task-metric pairs, each a two-item flow list of strings
# ["task_name", "metric_name"] - you must only run one task with one metric (no repeat tasks)
task_metric:
  # Speech recognition datasets
  - ["alpaca_audio_test", "llm_judge_detailed"]
  - ["librispeech_test_clean", "word_error_rate"]
  # Emotion recognition runspec
  - ["emotion_recognition", "llm_judge_binary"]
  # Gender recognition datasets
  - ["voxceleb_gender_test", "llm_judge_binary"]
  - ["iemocap_gender_recognition", "llm_judge_binary"]
  # Question-answering datasets and runspecs
  - ["big_bench_audio", "llm_judge_big_bench_audio"]
  - ["music_understanding", "llm_judge_binary"]

# Optional: Aggregate multiple task-metric pairs into a single score.
# Each entry is a two-item list: ["metric_name", ["task1", "task2", ..., "taskN"]]
aggregate:
  - ["llm_judge_binary", ["emotion_recognition"]]
  - ["llm_judge_binary", ["voxceleb_gender_test", "iemocap_gender_recognition"]]

# Sample filtering applied before evaluation
filter:
  num_samples: 100  # number of samples to run (remove for all)
  length_filter: [0.0, 30.0]  # optional - keeps only audio samples within this length range (seconds) - only supported for general and callhome preprocessors

# LLM-as-judge configuration (used by the llm_judge_* metrics above)
judge_settings:
  judge_concurrency: 8  # concurrent judge calls (optional)
  judge_model: gpt-4o-mini  # optional
  judge_type: openai  # mandatory (vllm or openai)
  judge_api_version: ${API_VERSION}  # optional (needed for openai)
  judge_api_endpoint: ${ENDPOINT_URL}  # mandatory
  judge_api_key: ${AUTH_TOKEN}  # mandatory
  judge_temperature: 0.1  # optional
  judge_prompt_model_override: gpt-4o-mini-enhanced  # optional

logging:
  log_file: "audiobench.log"  # Path to the main log file

# Models under evaluation; each entry is one endpoint configuration
models:
  - name: gpt-4o-mini-audio-preview-1  # must be unique
    inference_type: openai  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: gpt-4o-mini-audio-preview
    auth_token: ${AUTH_TOKEN}
    api_version: ${API_VERSION}
    batch_size: 300  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model
  - name: gpt-4o-mini-audio-preview-2  # must be unique
    inference_type: openai  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: gpt-4o-mini-audio-preview
    auth_token: ${AUTH_TOKEN}
    api_version: ${API_VERSION}
    batch_size: 300  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model
  - name: gemini-2.5-flash  # must be unique
    inference_type: gemini  # you can use vllm, openai, gemini or transcription
    location: ${GOOGLE_CLOUD_LOCATION}  # GCP Vertex AI configuration
    project_id: ${GOOGLE_CLOUD_PROJECT}  # GCP Vertex AI configuration
    reasoning_effort: medium  # Optional - reasoning effort for supported reasoning models like gemini-2.5-flash, gpt-5, ...
    delay: 100
    retry_attempts: 5
    timeout: 300
    model: google/gemini-2.5-flash
    batch_size: 100  # Optional - batch eval size
    chunk_size: 30240  # Optional - max audio length in seconds fed to model (NOTE(review): 30240 s ~ 8.4 h, far larger than other models' 30-40 - confirm intended)
  - name: qwen-2.5-omni  # must be unique
    inference_type: vllm  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: qwen-2.5-omni
    auth_token: ${AUTH_TOKEN}
    batch_size: 200  # Optional - batch eval size
    chunk_size: 40  # Optional - max audio length in seconds fed to model
  - name: whisper-large-3  # must be unique
    inference_type: transcription  # you can use vllm, openai, gemini or transcription
    url: ${ENDPOINT_URL}  # endpoint url
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: whisper-large-3
    auth_token: ${AUTH_TOKEN}
    batch_size: 100  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model

# Data sharding - If two models have same "model" attribute, we implement dataset sharding
# In command line you can also pass custom config file name to read from with bash evaluate.sh --config <config_file>