reproduce_cqe_scores.py
from cbtest_wrapper import test_sample, test_sample_refpatch
import cPickle
from collections import OrderedDict
import csv
import glob
import os

#### Settings for this script ####
SCORE_TO_COMPARE = 'PerfScore'
#SCORE_TO_COMPARE = 'ExecTimeOverhead'
ALLOW_RECOMPUTING = False
# Too big to put all into this git, and used just for internal testing anyway:
# https://github.com/CyberGrandChallenge/samples/tree/master/cqe-challenges
SRC_DIR = './cqe-recap/source'
# http://repo.cybergrandchallenge.com/cqe_results/cqe_final_submissions.zip
SUBMISSIONS_DIR = './cqe-recap/submissions'
# http://repo.cybergrandchallenge.com/cqe_results/cqe_results-0d5587e0.zip
SCORES_CSV = './cqe-recap/results/CQE_SCORES.csv'
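
# Rough workflow (inferred from the code below):
#   1. get_reference_scores(): score DARPA's own reference patch for each CQE
#      challenge set (cached in reference_scores.pickle).
#   2. main(): for every (TeamName, cset) row in DARPA's CQE_SCORES.csv, score
#      that team's replacement binaries the same way (cached in
#      replacement_scores.pickle) and print our estimate next to DARPA's value.
#   3. compare_scores(): per challenge set, compare the team ranking induced by
#      our SCORE_TO_COMPARE estimates against DARPA's, including Kendall's tau.
# With ALLOW_RECOMPUTING = False, only the cached pickles are used; recomputing
# additionally needs the CQE sources and submissions linked above.
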
def compute_reference_scores(cset_name):
    sample_dir = SRC_DIR+'/'+cset_name
    print "Computing reference score for", cset_name
    try:
        fails, prevented, _, patched_scores = test_sample_refpatch(sample_dir, only_n_polls=100)
    except Exception as e:
        print "IGNORING CS THAT CAUSED A test_sample_refpatch EXCEPTION", cset_name, e
        return None
    if fails:
        print "IGNORING CS THAT FAILED ITS OWN POLL TEST", cset_name, "(poll fails:", fails, ")"
        return None
    if len(prevented) != len(glob.glob(sample_dir+"/pov/*.xml")+glob.glob(sample_dir+"/pov/*.pov")):
        print "OFFICIAL PATCH FOR ", cset_name, " DOES NOT PREVENT ALL PoVs!"
    return patched_scores

def get_reference_scores():
    if os.path.isfile('reference_scores.pickle'):
        with open('reference_scores.pickle', 'rb') as f:
            refs = cPickle.load(f)
    else:
        assert ALLOW_RECOMPUTING
        all_refs = { cs:compute_reference_scores(cs) for cs in os.listdir(SRC_DIR) }
        #all_refs = { cs:compute_reference_scores(cs) for cs in os.listdir(SRC_DIR) if cs.startswith('YAN') } # XXX DEBUG ONLY XXX
        refs = { cs:score for cs,score in all_refs.iteritems() if score is not None }
        # Sanity check
        for scores in refs.values():
            for val in scores.values():
                assert val is not None
    if ALLOW_RECOMPUTING:
        with open('reference_scores.pickle','wb') as f:
            cPickle.dump(refs, f, protocol=cPickle.HIGHEST_PROTOCOL)
    return refs

def ordered_sets(dic):
    """Turns { key: val } into descending OrderedDict { val: set(k1,k2,...) }"""
    sortedvals = sorted(set(dic.values()), reverse=True)
    ret = OrderedDict()
    for i in sortedvals:
        ret[i] = frozenset(k for k,v in dic.iteritems() if v == i)
    return ret

def compare_scores(byus, bydarpa):
    """byus = { team: score }, bydarpa = { team: score }"""
    assert frozenset(byus.keys()) == frozenset(bydarpa.keys())
    our_ranking = ordered_sets(byus)
    darpa_ranking = ordered_sets(bydarpa)
    our_picks = our_ranking.values()[0]
    darpa_picks = darpa_ranking.values()[0]
    from scipy import stats
    # scipy takes them as ordered lists
    teamorder = list(byus.keys())
    vals_us = [ byus[t] for t in teamorder ]
    vals_darpa = [ bydarpa[t] for t in teamorder ]
    tau, p_value = stats.kendalltau(vals_us, vals_darpa)
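    # Kendall's tau measures rank agreement between the two score vectors:
    # +1.0 means identical ordering, -1.0 a fully reversed one. The summary
    # line printed below flags challenge sets where tau drops under 0.8 ("<8")
    # or under 0.7 ("<7").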
    def names(teams_set):
        return '[' + ' '.join(sorted(n.split()[0] for n in teams_set)) + ']'
    if our_picks == darpa_picks:
        print "[ ] All first choice(s)", names(our_picks), "match, excellent!"
    elif our_picks.isdisjoint(darpa_picks):
        print "[XX] Our first choice(s)", names(our_picks), " completely different from DARPA's", names(darpa_picks)
    else:
        print "[__] Partial match between our first choice(s) and DARPA's. Both have", names(darpa_picks&our_picks), "(we also have:", names(our_picks-darpa_picks), " -- darpa also has:", names(darpa_picks-our_picks), ")"
    print " FOR US:"
    for score,teams in our_ranking.iteritems():
        print " ", "%+.4f"%score, names(teams)
    print " DARPA:"
    for score,teams in darpa_ranking.iteritems():
        print " ", "%+.4f"%score, names(teams)
    print " %s Kendall tau: %.4f (p-value for being correlated: %.6f)" % (("<7" if tau < 0.7 else "<8") if tau < 0.8 else " ", tau, p_value)

def main():
    refs = get_reference_scores() # cset -> our scoring of the official patch
    reps = {} # (cset, TeamName) -> our scoring of that team's patch
    if os.path.isfile('replacement_scores.pickle'):
        with open('replacement_scores.pickle', 'rb') as f:
            reps = cPickle.load(f)
    darpa = {} # (cset, TeamName) -> darpa's CSV row
    csets = set() # scored cset names
    with open(SCORES_CSV) as f, open('replacement_scores.csv' if ALLOW_RECOMPUTING else '/dev/null','w') as ourf:
        if ALLOW_RECOMPUTING:
            ourcsv = csv.DictWriter(ourf, fieldnames=('TeamName','cset','PerfScore','ExecTimeOverhead','FileSizeOverhead','MemUseOverhead'), extrasaction='ignore')
            ourcsv.writeheader()
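        # One row per (TeamName, cset) entry in DARPA's results sheet: look up
        # (or, if allowed, recompute) our own score and print it next to DARPA's.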
        for row in csv.DictReader(f):
            cs = row['cset']
            key = (cs, row['TeamName'])
            darpa[key] = row
            if cs not in refs:
                #print "Skipping", cs
                continue
            print key
            csets.add(cs)
            if key in reps:
                scores = reps[key]
            else:
                if not ALLOW_RECOMPUTING:
                    print "SKIPPING UNSCORED", cs
                    continue
                assert row['package_name'].endswith('.ar')
                rcbs = glob.glob(SUBMISSIONS_DIR+'/'+row['package_name'][:-3]+'/RB*')
                _, _, _, scores = test_sample(rcbs, SRC_DIR+'/'+cs, reference_score=refs[cs], only_n_polls=100)
                reps[key] = scores
            if scores['Scoring algorithm'] != 'CQE':
                print "Skipping, as the scoring algorithm is not the CQE one: ", scores['Scoring algorithm']
                del reps[key]
                continue
            for metric in sorted(set(scores.keys()) & set(row.keys())):
                darpatruth = float(row[metric])
                estimate = float(scores[metric])
                diff = estimate - darpatruth
                print " %s %20s DARPA: %.2f estim.: %.2f (%.2f)" % ("+++" if diff>0.2 else ("---" if diff<-0.2 else " "), metric, darpatruth, estimate, diff)
            if ALLOW_RECOMPUTING:
                forcsv = scores
                forcsv['cset'] = cs
                forcsv['TeamName'] = row['TeamName']
                ourcsv.writerow(forcsv)
    if ALLOW_RECOMPUTING:
        with open('replacement_scores.pickle','wb') as f:
            cPickle.dump(reps, f, protocol=cPickle.HIGHEST_PROTOCOL)
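    # Finally, for every challenge set that got scored, rank the teams by
    # SCORE_TO_COMPARE according to our estimates and according to DARPA's
    # numbers, and report how well the two rankings agree.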
    for cs in csets:
        teams = [ TeamName for c,TeamName in reps if c == cs ]
        byus = { t:float(reps[(cs,t)][SCORE_TO_COMPARE]) for t in teams }
        bydarpa = { t:float(darpa[(cs,t)][SCORE_TO_COMPARE]) for t in teams }
        print cs
        compare_scores(byus, bydarpa)

if __name__ == "__main__":
    main()
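
# Usage sketch (assumptions, not stated in the file itself: a Python 2
# environment with scipy, the repo's cbtest_wrapper module, and whatever CGC
# testing tools it wraps, plus the cqe-recap/ data laid out as in the settings
# above):
#   python2 reproduce_cqe_scores.py
# With ALLOW_RECOMPUTING = False it only replays reference_scores.pickle and
# replacement_scores.pickle; set it to True to rebuild them from the CQE
# sources and final submissions.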