-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathabs_diff.py
More file actions
98 lines (76 loc) · 3.01 KB
/
Copy pathabs_diff.py
File metadata and controls
98 lines (76 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
"""
Computes true absolute difference (no cancellation at any level) between
two evaluations of the same model.
Usage:
python abs_diff.py <model> <trial> <evaluator_a> <evaluator_b>
Example:
python abs_diff.py gemma-4-31b-it 1 reviewed-claude-sonnet-4-6 claude-sonnet-4-6-v2
"""
import json
import os
import re
import sys
# Absolute directory holding this script. Data paths are anchored here rather
# than at the current working directory.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
# Root of the evaluation tree that main() descends into.
EVALUATIONS_DIR = os.path.join(SCRIPT_DIR, "Evaluations")
def load_eval_dir(path):
    """Load the per-question evaluation files found directly inside *path*.

    Only files named ``EX<digits>.json`` are read; every other directory
    entry is ignored.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A dict mapping each file's top-level ``"id"`` value to the parsed
        JSON object. If two files carry the same id, the one listed later
        by ``os.listdir`` wins.

    Raises:
        OSError: If the directory or one of the files cannot be read.
        json.JSONDecodeError: If a matching file is not valid JSON.
        KeyError: If a matching file has no top-level ``"id"`` key.
    """
    entries = {}
    for fname in os.listdir(path):
        if not re.match(r"^EX\d+\.json$", fname):
            continue
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform's locale encoding, which differs between systems.
        with open(os.path.join(path, fname), encoding="utf-8") as f:
            data = json.load(f)
        entries[data["id"]] = data
    return entries
def main():
    """Print the absolute score differences between two evaluations.

    Reads four positional arguments from ``sys.argv``:
    ``<model> <trial> <evaluator_a> <evaluator_b>``. Each evaluation is loaded
    from ``EVALUATIONS_DIR/<model>/Trial<trial>/<evaluator>``.

    For every question id present in both evaluations, the absolute score
    difference is summed over the criteria the two evaluations share, so
    differences of opposite sign never cancel. Criteria present in only one
    evaluation are excluded from the totals and reported on stderr.

    Output on stdout: a per-question table, totals by question type, totals
    by criterion (non-zero only, largest first), and the grand total.

    Exits with status 1, with the message on stderr, when the argument count
    is wrong, an evaluation directory is missing, or the two evaluations
    share no question ids.
    """
    if len(sys.argv) != 5:
        # Usage on a bad invocation is a diagnostic, so it goes to stderr like
        # the other failure paths.
        print("Usage: python abs_diff.py <model> <trial> <evaluator_a> <evaluator_b>", file=sys.stderr)
        print("Example: python abs_diff.py gemma-4-31b-it 1 reviewed-claude-sonnet-4-6 claude-sonnet-4-6-v2", file=sys.stderr)
        sys.exit(1)

    model, trial, eval_a, eval_b = sys.argv[1:5]
    trial_dir = f"Trial{trial}"
    dir_a = os.path.join(EVALUATIONS_DIR, model, trial_dir, eval_a)
    dir_b = os.path.join(EVALUATIONS_DIR, model, trial_dir, eval_b)

    for label, path in [("A", dir_a), ("B", dir_b)]:
        if not os.path.isdir(path):
            print(f"ERROR: directory not found ({label}): {path}", file=sys.stderr)
            sys.exit(1)

    a = load_eval_dir(dir_a)
    b = load_eval_dir(dir_b)

    # Ids are ordered by the integer that follows their two-character prefix.
    common = sorted(set(a) & set(b), key=lambda x: int(x[2:]))
    if not common:
        print("No questions in common.", file=sys.stderr)
        sys.exit(1)

    by_type = {}  # question_type -> list of per-question abs diffs
    by_crit = {}  # criterion -> total abs diff
    total_abs = 0

    print(f"{'Question':<10} {'Type':<25} {'Abs Diff':>8} Criterion breakdown")
    print("-" * 80)

    for qid in common:
        qa = {c["criterion"]: c["score"] for c in a[qid]["criteria"]}
        qb = {c["criterion"]: c["score"] for c in b[qid]["criteria"]}
        crits = sorted(set(qa) & set(qb))

        # Criteria scored by only one evaluator cannot be compared. Say so
        # instead of silently shrinking the total.
        unmatched = sorted(str(c) for c in set(qa) ^ set(qb))
        if unmatched:
            print(
                f"WARNING: {qid}: criteria present in only one evaluation "
                f"are skipped: {', '.join(unmatched)}",
                file=sys.stderr,
            )

        abs_by_crit = {c: abs(qa[c] - qb[c]) for c in crits}
        q_abs = sum(abs_by_crit.values())

        # The signed value shown is A - B. A bare "+" format spec renders ints
        # exactly as "+d" does and also accepts float scores.
        diff_str = " ".join(
            f"{c}:{qa[c]}→{qb[c]}({qa[c] - qb[c]:+})"
            for c in crits
            if qa[c] != qb[c]
        )

        # The question type is taken from evaluation A.
        qtype = a[qid]["question_type"]
        by_type.setdefault(qtype, []).append(q_abs)
        for c, delta in abs_by_crit.items():
            by_crit[c] = by_crit.get(c, 0) + delta
        total_abs += q_abs

        print(f"{qid:<10} {qtype:<25} {q_abs:>8} {diff_str}")

    print()
    print("By question type")
    print("-" * 40)
    for qtype, q_diffs in sorted(by_type.items()):
        print(f" {qtype:<25} {sum(q_diffs):>6} ({len(q_diffs)} questions)")

    print()
    print("By criterion")
    print("-" * 40)
    # Largest disagreement first; criteria the evaluators agree on are omitted.
    for crit, d in sorted(by_crit.items(), key=lambda x: -x[1]):
        if d > 0:
            print(f" {crit:<25} {d:>6}")

    print()
    print(f" {'TOTAL ABS DIFF':<25} {total_abs:>6}")
if __name__ == "__main__":
    # Run the comparison only when this file is executed as a script, not
    # when it is imported.
    main()