-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcompute_metrics.py
170 lines (143 loc) · 6.89 KB
/
compute_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
from __future__ import division
import sys
from glob import glob
from os.path import join, basename
from util import read_json
from db_schema import (HTTP_REQUESTS_TABLE,
HTTP_RESPONSES_TABLE,
JAVASCRIPT_TABLE)
# Filename suffixes of the per-crawl summary JSON files produced by the
# DB processing step; each file is named "<crawl_name><suffix>".
CMD_FAIL_RATES_JSON = "_command_fail_rate.json"
CMD_TIMEOUT_RATES_JSON = "_command_timeout_rate.json"
NUM_REQUESTS_JSON = "_sv_num_requests.json"
NUM_RESPONSES_JSON = "_sv_num_responses.json"
NUM_ENTRIES_WITHOUT_VISIT_ID_JSON = "_entries_without_visit_id.json"
NUM_ENTRIES_JSON = "_num_entries.json"
# Subdirectory names under the root JSON dir. Only ANALYSIS_DIR is
# referenced in this file; the other two are presumably used by sibling
# scripts — confirm before removing.
ANALYSIS_DIR = "analysis"
DB_SCHEMAS_DIR = "db-schemas"
LOG_FILES_DIR = "log-files"
class CrawlMetrics(object):
    """Per-crawl quality metrics populated by CrawlJsonCheck.

    All values default to 0; rate_* attributes may be overwritten with
    None when the corresponding command type is absent from the crawl.
    """

    # Total entry counts per table.
    num_requests = num_responses = num_javascript = 0
    # Fraction of entries recorded without a real visit id.
    rate_requests_without_visit_id = 0
    rate_responses_without_visit_id = 0
    rate_javascript_without_visit_id = 0
    # Command failure rate, per command type.
    rate_cmd_failure_get = 0
    rate_cmd_failure_browse = 0
    rate_cmd_failure_dmp_flash_cookies = 0
    # Command timeout rate, per command type.
    rate_cmd_timeout_get = 0
    rate_cmd_timeout_browse = 0
    rate_cmd_timeout_dmp_flash_cookies = 0
    # Fraction of sites with requests but zero responses.
    rate_visits_without_responses = 0
class CrawlJsonCheck(object):
    """Analyze the JSON files generated by the DB processing step.

    Reads the per-crawl summary JSON files in <root_json_dir>/analysis
    (named "<crawl_name><suffix>") and fills a CrawlMetrics instance
    with entry counts and failure/timeout/missing-visit-id rates.
    """

    def __init__(self, root_json_dir, crawl_name):
        self.metrics = CrawlMetrics()
        self.root_json_dir = root_json_dir
        self.crawl_name = crawl_name
        self.root_analysis_dir = join(root_json_dir, ANALYSIS_DIR)

    @staticmethod
    def _safe_rate(numerator, denominator):
        """Return numerator / denominator, or 0 when denominator is 0.

        True division is in effect (from __future__ import division),
        so int operands yield a float rate.
        """
        return (numerator / denominator) if denominator else 0

    def _analysis_json_path(self, suffix):
        """Return the path of this crawl's analysis JSON with `suffix`."""
        return join(self.root_analysis_dir,
                    "%s%s" % (self.crawl_name, suffix))

    def run_checks(self):
        """Run all checks, populating self.metrics."""
        self.compute_rate_of_entries_with_missing_visit_id()
        self.check_failure_and_timeout_rates()
        self.check_requests_and_responses()

    def read_num_entries(self):
        """Load the total per-table entry counts into self.metrics."""
        num_entries = read_json(self._analysis_json_path(NUM_ENTRIES_JSON))
        self.metrics.num_requests = num_entries[HTTP_REQUESTS_TABLE]
        self.metrics.num_responses = num_entries[HTTP_RESPONSES_TABLE]
        self.metrics.num_javascript = num_entries[JAVASCRIPT_TABLE]

    def compute_rate_of_entries_with_missing_visit_id(self):
        """Some requests, responses and JS are stored with visit
        id = -1 (i.e. no real visit id). Compute the rate of such
        entries for each table.
        """
        # read total num of entries so we can compute the rates
        self.read_num_entries()
        missing = read_json(
            self._analysis_json_path(NUM_ENTRIES_WITHOUT_VISIT_ID_JSON))
        # _safe_rate guards against ZeroDivisionError on empty tables
        self.metrics.rate_requests_without_visit_id = self._safe_rate(
            missing[HTTP_REQUESTS_TABLE], self.metrics.num_requests)
        self.metrics.rate_responses_without_visit_id = self._safe_rate(
            missing[HTTP_RESPONSES_TABLE], self.metrics.num_responses)
        self.metrics.rate_javascript_without_visit_id = self._safe_rate(
            missing[JAVASCRIPT_TABLE], self.metrics.num_javascript)

    def check_missing_requests(self, requests, responses):
        """Log sites with zero requests and some responses."""
        # items() instead of Py2-only iteritems(): works on both 2 and 3
        for domain, num_responses in responses.items():
            num_requests = requests.get(domain, 0)
            if not num_requests:
                print (self.crawl_name, domain, num_requests, num_responses,
                       "requests are missing")

    def check_requests_and_responses(self):
        """Compare per-site request and response counts.

        Sets metrics.rate_visits_without_responses and logs sites whose
        requests are missing entirely.
        """
        num_sites_w_reqs = 0
        num_sites_w_more_responses = 0
        num_sites_w_no_responses = 0
        requests = read_json(self._analysis_json_path(NUM_REQUESTS_JSON))
        responses = read_json(self._analysis_json_path(NUM_RESPONSES_JSON))
        for domain, num_requests in requests.items():
            num_sites_w_reqs += 1
            num_responses = responses.get(domain, 0)
            if num_responses > num_requests:
                # More responses than requests can be legitimate; see
                # https://github.com/citp/openwpm-data-release/issues/1#issuecomment-415886730  # noqa
                num_sites_w_more_responses += 1
            elif not num_responses:
                num_sites_w_no_responses += 1
        # _safe_rate guards against a crawl with zero sites
        self.metrics.rate_visits_without_responses = self._safe_rate(
            num_sites_w_no_responses, num_sites_w_reqs)
        self.check_missing_requests(requests, responses)

    def check_failure_and_timeout_rates(self):
        """Load command failure and timeout rates into self.metrics.

        A rate is None when the command type is absent from the JSON.
        """
        cmd_failure_rates = read_json(
            self._analysis_json_path(CMD_FAIL_RATES_JSON))
        self.metrics.rate_cmd_failure_get = cmd_failure_rates.get("GET", None)
        self.metrics.rate_cmd_failure_browse = \
            cmd_failure_rates.get("BROWSE", None)
        self.metrics.rate_cmd_failure_dmp_flash_cookies = \
            cmd_failure_rates.get("DUMP_FLASH_COOKIES", None)
        cmd_timeout_rates = read_json(
            self._analysis_json_path(CMD_TIMEOUT_RATES_JSON))
        self.metrics.rate_cmd_timeout_get = cmd_timeout_rates.get("GET", None)
        self.metrics.rate_cmd_timeout_browse = \
            cmd_timeout_rates.get("BROWSE", None)
        self.metrics.rate_cmd_timeout_dmp_flash_cookies = \
            cmd_timeout_rates.get("DUMP_FLASH_COOKIES", None)
def check_jsons_in_dir(root_json_dir):
    """Run CrawlJsonCheck for every crawl found under `root_json_dir`.

    Crawl names are derived from the *_command_fail_rate.json files in
    the analysis subdirectory. Returns a dict mapping crawl name to its
    CrawlMetrics; a crawl whose checks raise is reported and skipped.
    """
    root_analysis_dir = join(root_json_dir, ANALYSIS_DIR)
    metrics = {}
    # we get the crawl names by iterating over cmd fail rate json files
    pattern = join(root_analysis_dir, "*%s" % CMD_FAIL_RATES_JSON)
    for json_path in sorted(glob(pattern)):
        crawl_name = basename(json_path).replace(CMD_FAIL_RATES_JSON, "")
        json_check = CrawlJsonCheck(root_json_dir, crawl_name)
        try:
            json_check.run_checks()
        except Exception as e:
            # Best effort: report which crawl failed and keep going.
            # print(...) with a single pre-formatted string behaves the
            # same under Python 2 and 3 (the original Py2-only
            # `print "Exception", e` is a SyntaxError on Python 3).
            print("Exception while checking %s: %s" % (crawl_name, e))
        else:
            metrics[crawl_name] = json_check.metrics
    return metrics
if __name__ == '__main__':
    # Fail with a usage message instead of a bare IndexError when the
    # root JSON dir argument is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: python compute_metrics.py <root_json_dir>")
    check_jsons_in_dir(sys.argv[1])