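"""BLEU evaluation with (paired) bootstrap resampling.

Computes corpus-level BLEU with Moses' multi-bleu.pl and, by bootstrap
resampling of the test set, estimates either a confidence interval for a
single system or a paired comparison between two systems.
"""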
from __future__ import print_function, division

import argparse
import os
import subprocess
import sys

import numpy as np

from utils import loadtxt, savetxt

def bleu_score(ref_file, hyp_file):
    """Computes the corpus-level BLEU score with Moses' multi-bleu.pl script.

    Arguments:
        ref_file (str): Path to the reference file
        hyp_file (str): Path to the hypothesis file

    Returns:
        tuple: Tuple (BLEU, details) containing the BLEU score
            and the detailed output of the perl script

    Raises:
        ValueError: If the perl script fails for any reason
    """
    command = 'perl scripts/multi-bleu.pl ' + ref_file + ' < ' + hyp_file
    c = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    details, error = c.communicate()
    details, error = details.decode('utf-8'), error.decode('utf-8')
    if not details.startswith('BLEU ='):
        raise ValueError('Error in BLEU score computation:\n%s' % error)
    # The report starts with 'BLEU = <score>,'; take the score and drop the
    # trailing comma.
    BLEU_str = details.split(' ')[2][:-1]
    BLEU = float(BLEU_str)
    return BLEU, details
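
# A minimal usage sketch (hypothetical paths). multi-bleu.pl prints a report
# roughly of the form 'BLEU = 33.17, 66.5/40.2/28.1/20.3 (BP=0.993, ...)',
# and bleu_score() returns the leading score alongside the full report:
#
#     bleu, details = bleu_score('data/test.ref', 'out/system.hyp')
#     print('%.2f' % bleu)  # e.g. 33.17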

parser = argparse.ArgumentParser(
    description='Program to compare MT results',
)
parser.add_argument('ref', type=str, help='path to the correct reference file')
parser.add_argument('out', type=str, help='path to a system output')
parser.add_argument('otherout', nargs='?', type=str, default=None,
                    help='path to another system output; add only if '
                         'you want to compare outputs from two systems')
parser.add_argument('--num_samples', '-M', type=int, default=100,
                    help='number of samples for bootstrap resampling')
parser.add_argument('--sample_size', type=float, default=50,
                    help='size of each sample (as a percentage of the total size)')
parser.add_argument('--bleufile', type=str, default='bleus.txt',
                    help='where to store the bleu scores')
parser.add_argument('--verbose', '-v', action='store_true',
                    help='increase output verbosity')

def print_stats(bleus):
    """Prints summary statistics for a sorted array of BLEU scores."""
    print('Mean: %.3f, Std: %.3f, Min: %.3f, Max: %.3f' %
          (bleus.mean(), bleus.std(), bleus.min(), bleus.max()))
    # Percentile interval read off by index: assumes `bleus` is sorted.
    print('95%% confidence interval: %.3f - %.3f' %
          (bleus[int(0.025 * len(bleus))], bleus[int(0.975 * len(bleus))]))

def print_paired_stats(bleus):
    """Prints win/tie percentages for paired BLEU scores of shape (M, 2)."""
    num_samples = len(bleus)
    win1 = (bleus[:, 0] > bleus[:, 1]).sum() / num_samples * 100
    win2 = (bleus[:, 0] < bleus[:, 1]).sum() / num_samples * 100
    ties = (bleus[:, 0] == bleus[:, 1]).sum() / num_samples * 100
    print('System 1 > system 2: %.3f' % win1)
    print('System 1 < system 2: %.3f' % win2)
    print('Ties: %.3f' % ties)

def load_if_file(corpus):
    """Loads `corpus` from disk if it is a path, otherwise returns it as-is."""
    if isinstance(corpus, str):
        # Convert to an array so that fancy indexing (corpus[subset]) works.
        corpus = np.asarray(loadtxt(corpus))
    return corpus

def bootstrap_resampling(ref, out, num_samples, sample_percent,
                         dummy_ref, dummy_out, verbose=False):
    ref = load_if_file(ref)
    out = load_if_file(out)
    n = len(ref)
    assert n == len(out), 'Mismatched reference and output file size'
    k = int(sample_percent * n / 100)
    bleus = []
    for i in range(num_samples):
        # Sample k sentence indices with replacement.
        subset = np.random.choice(n, k)
        savetxt(dummy_out, out[subset])
        savetxt(dummy_ref, ref[subset])
        bleu, _ = bleu_score(dummy_ref, dummy_out)
        bleus.append(bleu)
        if verbose and num_samples >= 10 and (i + 1) % (num_samples // 10) == 0:
            print('%d%% done' % ((i + 1) // (num_samples // 10) * 10))
            sys.stdout.flush()
    # Sort so that confidence intervals can be read off by index.
    bleus = np.sort(np.asarray(bleus))
    return bleus
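
# A minimal sketch of how the sorted scores are read (hypothetical file
# names; num_samples=1000 is illustrative, the script defaults to 100):
#
#     bleus = bootstrap_resampling('test.ref', 'system.hyp', 1000, 50,
#                                  'tmp_ref.txt', 'tmp_out.txt')
#     low = bleus[int(0.025 * len(bleus))]   # 2.5th percentile
#     high = bleus[int(0.975 * len(bleus))]  # 97.5th percentile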

def paired_bootstrap_resampling(ref, out, otherout, num_samples, sample_percent,
                                dummy_ref, dummy_out, dummy_otherout,
                                verbose=False):
    ref = load_if_file(ref)
    out = load_if_file(out)
    otherout = load_if_file(otherout)
    n = len(ref)
    assert n == len(out), 'Mismatched reference and output file size'
    assert n == len(otherout), 'Mismatched reference and other output file size'
    k = int(sample_percent * n / 100)
    bleus = []
    for i in range(num_samples):
        # Score both systems on the same resampled subset.
        subset = np.random.choice(n, k)
        savetxt(dummy_out, out[subset])
        savetxt(dummy_otherout, otherout[subset])
        savetxt(dummy_ref, ref[subset])
        bleu1, _ = bleu_score(dummy_ref, dummy_out)
        bleu2, _ = bleu_score(dummy_ref, dummy_otherout)
        bleus.append([bleu1, bleu2])
        if verbose and num_samples >= 10 and (i + 1) % (num_samples // 10) == 0:
            print('%d%% done' % ((i + 1) // (num_samples // 10) * 10))
            sys.stdout.flush()
    bleus = np.asarray(bleus)
    return bleus
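
# Paired sketch (hypothetical names): both systems are scored on the same
# subsets, so the fraction of samples where system 1 fails to beat system 2
# can be read as an approximate significance level (Koehn, 2004):
#
#     bleus = paired_bootstrap_resampling('test.ref', 'sys1.hyp', 'sys2.hyp',
#                                         1000, 50, 'tmp_ref.txt',
#                                         'tmp_out.txt', 'tmp_other.txt')
#     p = 1.0 - (bleus[:, 0] > bleus[:, 1]).mean()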

if __name__ == '__main__':
    args = parser.parse_args()
    ref = np.asarray(loadtxt(args.ref))
    out = np.asarray(loadtxt(args.out))
    assert len(ref) == len(out), 'Mismatched reference and output file size'
    # Temporary files for the resampled corpora, with a random prefix to
    # avoid collisions.
    dummy_num = np.random.randint(1000000)
    dummy_out = '%d_out.txt' % dummy_num
    dummy_ref = '%d_ref.txt' % dummy_num
    if args.otherout is None:
        # Normal bootstrap resampling
        bleus = bootstrap_resampling(ref, out, args.num_samples,
                                     args.sample_size, dummy_ref,
                                     dummy_out, verbose=args.verbose)
        total, _ = bleu_score(args.ref, args.out)
        print('Total BLEU: %.3f' % total)
        print_stats(bleus)
        np.savetxt(args.bleufile, bleus)
    else:
        # Paired bootstrap resampling: compare the two systems
        otherout = np.asarray(loadtxt(args.otherout))
        dummy_otherout = '%d_otherout.txt' % dummy_num
        bleus = paired_bootstrap_resampling(ref, out, otherout,
                                            args.num_samples,
                                            args.sample_size,
                                            dummy_ref, dummy_out,
                                            dummy_otherout,
                                            verbose=args.verbose)
        print_paired_stats(bleus)
        np.savetxt(args.bleufile, bleus)
        os.remove(dummy_otherout)
    os.remove(dummy_out)
    os.remove(dummy_ref)
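
# Example invocations (hypothetical file names):
#
#     python evaluation.py test.ref system.hyp -M 1000 -v
#     python evaluation.py test.ref system1.hyp system2.hyp -M 1000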