CSTutorBench/abs_diff.py at main · InviteInstitute/CSTutorBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
"""
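Computes the true absolute difference between two evaluators' scores for the
same model and trial. Per-criterion score differences are summed as absolute
values, so opposite-signed disagreements never cancel out at any level
(per question, per question type, per criterion, or in the overall total).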

Usage:
  python abs_diff.py <model> <trial> <evaluator_a> <evaluator_b>

Example:
  python abs_diff.py gemma-4-31b-it 1 reviewed-claude-sonnet-4-6 claude-sonnet-4-6-v2
"""

import json
import os
import re
import sys

# Absolute path of the directory that contains this script, so data is located
# relative to the script rather than the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Root of the evaluation data: an "Evaluations" folder next to the script.
EVALUATIONS_DIR = os.path.join(SCRIPT_DIR, "Evaluations")


def load_eval_dir(path):
    """Load every ``EX<number>.json`` file in *path*, keyed by its ``"id"`` field.

    Files whose names do not match ``EX<digits>.json`` exactly are ignored.

    Args:
        path: Directory containing the per-question evaluation JSON files.

    Returns:
        A dict mapping each file's ``"id"`` value to the parsed JSON object.

    Raises:
        OSError: If the directory or one of the files cannot be read.
        json.JSONDecodeError: If a matching file is not valid JSON.
        KeyError: If a matching file has no ``"id"`` key.
    """
    entries = {}
    # os.listdir returns names in arbitrary order; sorting makes the result
    # deterministic if two files ever declare the same id (last name wins).
    for fname in sorted(os.listdir(path)):
        # fullmatch instead of match(...$): "$" also matches before a
        # trailing newline, which would let a malformed name through.
        if not re.fullmatch(r"EX\d+\.json", fname):
            continue
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(os.path.join(path, fname), encoding="utf-8") as f:
            data = json.load(f)
        entries[data["id"]] = data
    return entries


def _question_sort_key(qid):
    """Order ids by the number after their two-character prefix (EX2 < EX10)."""
    try:
        return (0, int(qid[2:]), str(qid))
    except (TypeError, ValueError):
        # Ids without a numeric suffix sort after the regular ones instead of
        # aborting the whole comparison.
        return (1, 0, str(qid))


def _signed(delta):
    """Format a score difference with an explicit sign."""
    # The "d" presentation type is only valid for integers; a float score
    # difference would raise ValueError, so use general format for those.
    if isinstance(delta, int):
        return f"{delta:+d}"
    return f"{delta:+g}"


def _score_map(entry):
    """Map each criterion name in one evaluation entry to its score."""
    return {c["criterion"]: c["score"] for c in entry["criteria"]}


def _compare_question(entry_a, entry_b):
    """Compare one question's scores from evaluator A and evaluator B.

    Returns a tuple ``(abs_by_crit, breakdown, unmatched)``:
      * ``abs_by_crit``: criterion -> absolute score difference, for criteria
        present in both entries, in sorted criterion order;
      * ``breakdown``: human-readable "crit:a→b(+d)" text for the criteria
        whose scores differ;
      * ``unmatched``: sorted names of criteria present in only one entry.
    """
    scores_a = _score_map(entry_a)
    scores_b = _score_map(entry_b)
    shared = sorted(set(scores_a) & set(scores_b))

    abs_by_crit = {c: abs(scores_a[c] - scores_b[c]) for c in shared}
    breakdown = "  ".join(
        f"{c}:{scores_a[c]}→{scores_b[c]}({_signed(scores_a[c] - scores_b[c])})"
        for c in shared
        if scores_a[c] != scores_b[c]
    )
    unmatched = sorted(str(c) for c in set(scores_a) ^ set(scores_b))
    return abs_by_crit, breakdown, unmatched


def main():
    """Print the absolute score difference between two evaluators.

    Reads four command-line arguments (model, trial, evaluator A,
    evaluator B), loads both evaluators' results from
    ``EVALUATIONS_DIR/<model>/Trial<trial>/<evaluator>``, and prints:

      * one row per question present in both sets, with the summed absolute
        per-criterion difference and a breakdown of the differing criteria;
      * totals per question type;
      * totals per criterion (largest first, zero totals omitted);
      * the overall total.

    Only criteria present in both evaluations of a question are compared;
    criteria scored by just one evaluator are reported on stderr.

    Exits with status 1 (message on stderr) on wrong argument count, a
    missing evaluator directory, or when the two sets share no question.
    """
    if len(sys.argv) != 5:
        print("Usage: python abs_diff.py <model> <trial> <evaluator_a> <evaluator_b>", file=sys.stderr)
        print("Example: python abs_diff.py gemma-4-31b-it 1 reviewed-claude-sonnet-4-6 claude-sonnet-4-6-v2", file=sys.stderr)
        sys.exit(1)

    model, trial, eval_a, eval_b = sys.argv[1:5]
    trial_dir = f"Trial{trial}"
    dir_a = os.path.join(EVALUATIONS_DIR, model, trial_dir, eval_a)
    dir_b = os.path.join(EVALUATIONS_DIR, model, trial_dir, eval_b)

    for label, path in [("A", dir_a), ("B", dir_b)]:
        if not os.path.isdir(path):
            print(f"ERROR: directory not found ({label}): {path}", file=sys.stderr)
            sys.exit(1)
    a = load_eval_dir(dir_a)
    b = load_eval_dir(dir_b)

    common = sorted(set(a) & set(b), key=_question_sort_key)
    if not common:
        print("No questions in common.", file=sys.stderr)
        sys.exit(1)

    by_type = {}   # question_type -> list of per-question abs diffs
    by_crit = {}   # criterion -> total abs diff
    total_abs = 0

    print(f"{'Question':<10} {'Type':<25} {'Abs Diff':>8}  Criterion breakdown")
    print("-" * 80)

    for qid in common:
        abs_by_crit, diff_str, unmatched = _compare_question(a[qid], b[qid])
        if unmatched:
            # These criteria cannot be compared; say so rather than silently
            # under-reporting the difference.
            print(
                f"WARNING: {qid}: criteria scored by only one evaluator are "
                f"ignored: {', '.join(unmatched)}",
                file=sys.stderr,
            )

        q_abs = sum(abs_by_crit.values())

        # The question type is taken from evaluator A's entry.
        qtype = a[qid]["question_type"]
        by_type.setdefault(qtype, []).append(q_abs)
        for crit, delta in abs_by_crit.items():
            by_crit[crit] = by_crit.get(crit, 0) + delta
        total_abs += q_abs

        print(f"{qid:<10} {qtype:<25} {q_abs:>8}  {diff_str}")

    print()
    print(f"{'By question type'}")
    print("-" * 40)
    for qtype, diffs in sorted(by_type.items()):
        print(f"  {qtype:<25} {sum(diffs):>6}  ({len(diffs)} questions)")

    print()
    print(f"{'By criterion'}")
    print("-" * 40)
    # Largest disagreement first; criteria the evaluators fully agree on are
    # left out of the table.
    for crit, d in sorted(by_crit.items(), key=lambda x: -x[1]):
        if d > 0:
            print(f"  {crit:<25} {d:>6}")

    print()
    print(f"  {'TOTAL ABS DIFF':<25} {total_abs:>6}")


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()