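"""Map HTTP redirection chains from HTTP entries extracted from a PCAP.

Requests are split into time-based sessions; within each session, redirection
trees are built by following Referrer headers, Location headers and
content-based redirects, after which lone subdomain chains are folded into
related chains and per-hop timings are computed.
"""
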
import re

import tldextract
from anytree import Node, RenderTree, search


def extract_redir_chain(http_entries):
    final_redir_list = []
    new_http_entries = []
    # If there are more than x (2) minutes between HTTP requests, we separate them into sessions.
    # This means that if the user revisits a URL he visited 2 minutes ago, it will be in a separate chain.
    # Note that the x parameter should be determined by experience / trial and error.
    sessions = extract_sessions(http_entries)
    # For each 2-minute session
    for session in sessions:
        # Keep track of URLs and map the redirections that occur
        url_list = []
        redir_chain_list = []
        # Get each URL in the PCAP
        for entry in session:
            # Make sure this URL isn't already present
            if not any(entry['domain'] in url for url in url_list):
                url_list.append({entry['domain']: entry['ts']})
        # Process each URL
        for initial_url in url_list:
            for url, ts in initial_url.items():
                # If this URL has been redirected to in a previous chain, skip it
                if not is_url_processed(url, redir_chain_list):
                    # Otherwise, build a redirection map for this URL
                    redir_chain_list, session = process_entry_url(session, redir_chain_list, url, ts)
        # Make a list of all chains with only one URL (to prepare for subdomain redirection
        # checking). Iterate over a copy, since we remove nodes from the list as we go.
        single_url_chains = []
        for node in list(redir_chain_list):
            if node.height <= 1:
                single_url_chains.append(node)
                redir_chain_list.remove(node)
        # Lastly, process subdomain redirects: any URL in a single-URL chain that wasn't seen
        # in another chain, but whose domain was seen in another chain. We add the subdomain
        # before/after the other subdomain (depending on which was accessed first).
        new_redir_list = []
        for chain in redir_chain_list:
            new_node, single_url_chains = subdomain_redirects(chain.root, single_url_chains, session)
            new_redir_list.append(new_node)
        # Add the remaining single-URL chains to our new chain list (without duplicating
        # URLs that were in a single-URL chain but have been marked as subdomain redirections)
        new_redir_list += single_url_chains
        # Make sure each node is a root or we'll have problems when printing
        for node in new_redir_list:
            node.parent = None
            calculate_timings(node, session)
        # Add this session's redirection chains to the total list
        final_redir_list += new_redir_list
        new_http_entries += session
    return final_redir_list, new_http_entries
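

# A minimal sketch of the `http_entries` shape this module expects, inferred
# from the fields accessed above ('domain', 'ts', 'referrer', 'location',
# 'content_redirs'); the domains and timestamps are hypothetical.
def _example_http_entries():
    from datetime import datetime
    return [
        {'domain': 'example.com', 'ts': datetime(2020, 1, 1, 12, 0, 0),
         'referrer': '', 'location': '', 'content_redirs': []},
        {'domain': 'tracker.example.net', 'ts': datetime(2020, 1, 1, 12, 0, 1),
         'referrer': 'http://example.com/', 'location': '', 'content_redirs': []},
    ]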


def extract_sessions(http_entries):
    sessions = []
    start_pos = 0
    for current_pos in range(len(http_entries) - 1):
        # Get the difference between the time of the current request and the next request
        difference = (http_entries[current_pos + 1]['ts'] - http_entries[current_pos]['ts']).total_seconds()
        # Is there more than 2 minutes between the requests?
        if difference >= (2 * 60):
            # Extract the subset, including the request at the current position
            subset = http_entries[start_pos:current_pos + 1]
            # Add it to the list
            sessions.append(subset)
            # The next session starts at the following request
            start_pos = current_pos + 1
    # If we extracted subsets, we need to grab the final one
    if start_pos != 0:
        subset = http_entries[start_pos:]
        sessions.append(subset)
    # Otherwise, there was only one session
    else:
        sessions.append(http_entries)
    return sessions
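

# Illustrative only: with the hypothetical entries below, the 3-minute gap
# between 'b.com' and 'c.com' splits the input into two sessions of two
# entries each.
def _example_extract_sessions():
    from datetime import datetime, timedelta
    base = datetime(2020, 1, 1, 12, 0, 0)
    entries = [{'domain': 'a.com', 'ts': base},
               {'domain': 'b.com', 'ts': base + timedelta(seconds=30)},
               {'domain': 'c.com', 'ts': base + timedelta(minutes=3, seconds=30)},
               {'domain': 'd.com', 'ts': base + timedelta(minutes=4)}]
    return extract_sessions(entries)  # -> [[a.com, b.com], [c.com, d.com]]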


def calculate_timings(tree, session):
    # Convert absolute timestamps into seconds elapsed since the parent node was requested
    for _, _, node in RenderTree(tree):
        if not node.is_root:
            # Find the timestamp of the last request to the parent's domain
            # that occurred before the first request to this node's domain
            ts = None
            for entry in session:
                if entry['domain'] == node.parent.name:
                    ts = entry['ts']
                elif entry['domain'] == node.name:
                    break
            if ts:
                seconds = round((node.temp_ts - ts).total_seconds(), 3)
            else:
                seconds = round((node.temp_ts - node.parent.temp_ts).total_seconds(), 3)
            if seconds > 0:
                node.seconds = seconds
            else:
                node.seconds = 0
    # Delete the old timestamps
    del tree.temp_ts
    for node in tree.descendants:
        del node.temp_ts
    return tree
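

# A hypothetical sketch of calculate_timings: after the call, each non-root
# node carries a `seconds` attribute holding the delay since the last request
# to its parent's domain.
def _example_calculate_timings():
    from datetime import datetime, timedelta
    base = datetime(2020, 1, 1)
    root = Node('a.com', temp_ts=base)
    child = Node('b.com', redirs=['Referrer'], temp_ts=base + timedelta(seconds=2), parent=root)
    session = [{'domain': 'a.com', 'ts': base},
               {'domain': 'b.com', 'ts': base + timedelta(seconds=2)}]
    calculate_timings(root, session)
    return child.seconds  # -> 2.0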


def is_url_processed(url, redir_chain):
    # Check whether the URL already exists in any of the redirection chains
    for node in redir_chain:
        result = search.findall(node, filter_=lambda n: n.name == url)
        if len(result) > 0:
            return True
    return False


def process_entry_url(http_entries, redir_chain_list, initial_url, ts):
    # Define our initial node
    root_node = Node(initial_url, redirs=[], temp_ts=ts)
    # Get our initial tree of HTTP redirections
    root_node, _, http_entries = level_extract(http_entries, root_node, root_node, 0)
    # Fill in "Unknown" for any redirect that we couldn't identify a source for
    for descendant in root_node.root.descendants:
        # The descendant was redirected to, but has no known redirect type
        if len(descendant.redirs) == 0:
            descendant.redirs.append("Unknown")
    # We don't want to model root node redirs
    del root_node.root.redirs
    # Add this redirection chain to the larger list
    redir_chain_list.append(root_node)
    return redir_chain_list, http_entries


def subdomain_redirects(root_node, single_url_chains, http_entries):
    found = False
    # If there are redirections in the chain
    if root_node.height > 0:
        # Check every URL that the chain ends on
        for leaf in root_node.leaves:
            # Iterate over a copy, since we remove chains from the list as we go
            for node in list(single_url_chains):
                # Extract the domains
                parsed_url = tldextract.extract(node.name)
                parsed_leaf = tldextract.extract(leaf.name)
                # Make sure the domain matches and the subdomain doesn't
                if parsed_url.domain == parsed_leaf.domain and parsed_url.subdomain != parsed_leaf.subdomain:
                    for entry in http_entries:
                        # If we haven't already found the redirection
                        if not found:
                            # If the HTTP entry domain matches the single-URL chain and it was
                            # visited before the leaf but after the leaf's parent
                            if entry['domain'] == node.name and entry['ts'] < leaf.temp_ts and entry['ts'] > leaf.parent.temp_ts:
                                # Insert it before the leaf
                                Node(node.name, redirs=["Subdomain"], temp_ts=entry['ts'], parent=leaf.parent, children=[leaf])
                                found = True
                                single_url_chains.remove(node)
                            # If the HTTP entry domain matches the single-URL chain and it was visited after the leaf
                            elif entry['domain'] == node.name and entry['ts'] > leaf.temp_ts:
                                # Append it to the leaf
                                Node(node.name, redirs=["Subdomain"], temp_ts=entry['ts'], parent=leaf)
                                found = True
                                single_url_chains.remove(node)
    return root_node, single_url_chains
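

# A hypothetical sketch of subdomain folding: a lone 'cdn.example.com' chain is
# merged into an existing chain because it shares the registered domain
# 'example' with the leaf 'www.example.com' and was requested just after it.
def _example_subdomain_redirects():
    from datetime import datetime, timedelta
    base = datetime(2020, 1, 1)
    root = Node('start.com', temp_ts=base)
    leaf = Node('www.example.com', redirs=['Referrer'], temp_ts=base + timedelta(seconds=1), parent=root)
    lone = Node('cdn.example.com', redirs=[], temp_ts=base + timedelta(seconds=2))
    entries = [{'domain': 'cdn.example.com', 'ts': base + timedelta(seconds=2)}]
    new_root, remaining = subdomain_redirects(root, [lone], entries)
    return new_root, remaining  # 'cdn.example.com' now hangs off the leaf; remaining == []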


def level_extract(http_entries, root_node, current_node, pos):
    # Loop through the entries from position `pos` onwards, looking for redirections
    for index, entry in enumerate(http_entries[pos:], start=pos):
        # Make sure the referrer header isn't blank
        if 'referrer' in entry and entry['referrer']:
            # Make sure the referrer header matches the current node in the redirection chain
            if re.search(re.escape(current_node.name), entry['referrer']):
                # Make sure the redirection is to a new domain
                if not re.search(re.escape(entry['domain']), entry['referrer']):
                    # Store the redirection type, this is a useful feature
                    entry['redir_type'] = 'Referrer'
                    # Add this new domain to the redirection tree
                    root_node = add_node(http_entries, root_node, current_node, entry['redir_type'], entry['domain'], entry['ts'], index)
        # If there was no change in the referrer header, check the location header.
        # If the location is blank, there's no third-party redirection.
        if 'location' in entry and entry['location']:
            # If the currently processed node matches the domain in the HTTP entry
            if re.search(re.escape(current_node.name), entry['domain']):
                # Was there a redirection via the location header?
                if not current_node.name == entry['location']:
                    # Make sure the potential redirected-to URL was actually visited
                    if any(d['domain'] == entry['location'].split(':')[0] for d in http_entries):
                        # Store the redirection type, this is a useful feature
                        entry['redir_type'] = 'Location'
                        # We need to find the correct timestamp: the first request to the
                        # location domain that occurred at or after this entry
                        ts = None
                        for temp_entry in http_entries:
                            if ts is None:
                                if temp_entry['ts'] >= entry['ts']:
                                    if re.search(re.escape(temp_entry['domain']), entry['location']):
                                        ts = temp_entry['ts']
                        # If we failed to find the correct timestamp, fall back to this entry's
                        if ts is None:
                            ts = entry['ts']
                        # Add this new domain to the redirection tree
                        root_node = add_node(http_entries, root_node, current_node, entry['redir_type'], entry['location'], ts, index)
        # Process content-based redirections
        for redir_entry in entry['content_redirs']:
            # Make sure the redirect URL is populated and that the
            # redir_entry has the same URL as the current node
            if redir_entry['redir_url'] and redir_entry['url'] == current_node.name:
                # URLs of 5 or fewer characters can be discarded as errors
                if len(redir_entry['redir_url']) > 5:
                    # Make sure the potential redirect was visited after the current node
                    if redir_entry['ts'] >= current_node.temp_ts:
                        # Make sure the potential redirected-to URL was actually visited
                        if any(d['domain'] == redir_entry['redir_url'] for d in http_entries):
                            # Make sure the redirection is to a new domain
                            if not re.search(re.escape(redir_entry['redir_url']), redir_entry['url']):
                                # Add the redirect to the tree
                                root_node = add_node(
                                    http_entries,
                                    root_node,
                                    current_node,
                                    redir_entry['redir_type'],
                                    redir_entry['redir_url'],
                                    redir_entry['ts'],
                                    index)
    return root_node, current_node, http_entries
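

# A hypothetical sketch of a Location-header redirect: 'a.com' responds with
# Location: b.com and 'b.com' is requested next, so level_extract attaches
# 'b.com' under 'a.com' with redirect type 'Location'.
def _example_level_extract_location():
    from datetime import datetime, timedelta
    base = datetime(2020, 1, 1)
    entries = [
        {'domain': 'a.com', 'ts': base, 'referrer': '', 'location': 'b.com', 'content_redirs': []},
        {'domain': 'b.com', 'ts': base + timedelta(seconds=1), 'referrer': '', 'location': '', 'content_redirs': []},
    ]
    root = Node('a.com', redirs=[], temp_ts=base)
    root, _, _ = level_extract(entries, root, root, 0)
    return root  # root now has one child: b.com, with redirs == ['Location']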


def update_node_redir_types(redir_type, existing_redir_types):
    # Record the redirect type for this URL if it's not already present
    if redir_type != "Unknown":
        if redir_type not in existing_redir_types:
            existing_redir_types.append(redir_type)
    return existing_redir_types


def add_node(http_entries, root_node, current_node, redir_type, new_node_url, ts, index):
    # Placeholder for the new node (we'll only populate it if the URL isn't already in the tree)
    new_node = None
    # Loop through the siblings of the current node
    for sibling in current_node.siblings:
        # If the sibling URL matches the URL we are trying to add,
        # move the sibling underneath the current node instead
        if sibling.name == new_node_url:
            sibling.parent = current_node
            current_node.redirs = update_node_redir_types(redir_type, current_node.redirs)
            return root_node
    # If the new URL is equal to the root URL, don't add it again
    if root_node.root.name == new_node_url:
        return root_node
    # Ensure this node hasn't already been added
    for item in root_node.root.descendants:
        # If this item's name is the same as the new redirect URL and the
        # item's parent name is the same as the current node's URL
        if item.name == new_node_url and item.parent.name == current_node.name:
            # Update the redirect types for this URL
            item.redirs = update_node_redir_types(redir_type, item.redirs)
            return root_node
    # If a node with the same URL has been added (but under a different parent), don't update redirections
    for item in root_node.root.descendants:
        if item.name == new_node_url:
            return root_node
    # The node hasn't been added yet, so add it now. Don't add "Unknown";
    # we'll do that later if we fail to find any other redirection type.
    if redir_type != "Unknown":
        new_node = Node(new_node_url.lower(), redirs=[redir_type], temp_ts=ts, parent=current_node)
    else:
        new_node = Node(new_node_url.lower(), redirs=[], temp_ts=ts, parent=current_node)
    # Set the current node to the new node and recursively extract the next level
    current_node = new_node
    root_node, current_node, http_entries = level_extract(http_entries, root_node, current_node, index)
    return root_node


def clean_url(url):
    if url == "":
        return ""
    # Strip the scheme, a leading "www." and anything after the host[:port]
    stripped_url = re.search(r"(https?://)?(www\.)?([a-z0-9\-\.\:]+)", url, flags=re.I)
    cleaned_url = ""
    try:
        cleaned_url = stripped_url.group(3).lower()
        # Drop an explicit default HTTP port
        if cleaned_url.endswith(':80'):
            cleaned_url = cleaned_url[:-len(':80')]
    except AttributeError:
        # re.search found no match and returned None
        print('Failed to clean: ' + url)
    return cleaned_url
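

# Illustrative checks with hypothetical inputs:
def _example_clean_url():
    assert clean_url('https://www.Example.com:80/path') == 'example.com'
    assert clean_url('http://sub.example.com/x') == 'sub.example.com'


# A minimal end-to-end sketch using the hypothetical entries defined above:
# build the redirection chains and print each resulting tree.
if __name__ == "__main__":
    chains, _ = extract_redir_chain(_example_http_entries())
    for chain in chains:
        for pre, _, node in RenderTree(chain):
            print("%s%s (%ss)" % (pre, node.name, getattr(node, 'seconds', 0)))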