# crawler_methods.py
import urllib.parse
import urllib.request
import datetime
import time
import re
import base64
import math
from urllib.request import OpenerDirector
from enum import Enum

from bs4 import BeautifulSoup

# Crawler-wide constants
SEARCH_URL = 'http://elyon1.court.gov.il/verdictssearch/HebrewVerdictsSearch.aspx'
EXTENDED_INFO_URL_PREFIX = 'http://elyon2.court.gov.il/scripts9/mgrqispi93.dll' \
                           '?Appname=eScourt&Prgname=GetFileDetails&Arguments=-N'

# HTTP request headers
headers = {
    "User-Agent": "Mozilla/5.0 ElyonCrawler v1.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.8,he;q=0.6"
}
# Log message levels
class LogLevel(Enum):
    VERBOSE = 1
    INFO = 2
    ERROR = 3


# Crawler fault increments
class FaultLevel(Enum):
    INTERVAL = 75
    RESULT_PAGE = 25
    VERDICT = 3
def test_connectivity(cookie_jar, timeout=30) -> (OpenerDirector, BeautifulSoup):
    """Tests the initial connectivity of the remote server.

    The cookie_jar argument is used to collect any cookies that may be
    set by the server.
    Returns the opener object (so it can be reused) and the BeautifulSoup
    instance of the page.
    """
    # Build an opener with the cookie jar and try to get the response from the server
    req = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
    req.addheaders = list(headers.items())
    res = req.open(SEARCH_URL, timeout=timeout)

    # Upon 200 OK, parse the response into a BeautifulSoup object.
    # Otherwise, raise a RuntimeError with the returned HTTP code
    if res.code == 200:
        soup = BeautifulSoup(res, "html.parser")
    else:
        raise RuntimeError("Server returned code: " + str(res.code))
    return req, soup
def get_first_search_result_page(req, soup, start_date, end_date, technical=False, timeout=30) -> BeautifulSoup:
    """Retrieves the first result page from the server."""
    # Modify the HTTP request headers to suit the request
    search_headers = headers.copy()
    search_headers["Origin"] = "http://elyon1.court.gov.il"
    search_headers["Referer"] = "http://elyon1.court.gov.il/verdictssearch/HebrewVerdictsSearch.aspx"
    req.addheaders = list(search_headers.items())

    post_map = {
        "Search$ddlYear": "",
        "Search$txtCaseNumber": "",
        "Search$txtText": "",
        "Search$ddlOper": 0,
        "Search$txtJudges": "",
        "Search$txtSides": "",
        "Search$txtLawyers": "",
        "Search$ddlMadors": "",
        "Search$ddlPages": 0,
        "Search$chkTechnical": "on" if technical else "",  # Turns technical resolutions search on/off
        "Search$txtDateFrom": start_date.strftime("%d/%m/%Y"),
        "Search$txtDateTo": end_date.strftime("%d/%m/%Y"),
        "__VIEWSTATE": soup.find_all(id="__VIEWSTATE")[0]['value'],
        "__EVENTTARGET": "Search$lnkSearch",
        "__LASTFOCUS": "",
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": soup.find_all(id="__EVENTVALIDATION")[0]['value']
    }
    post_data = urllib.parse.urlencode(post_map).encode("UTF-8")

    # Request the search results page and create a BeautifulSoup object from the HTML response
    res = req.open(SEARCH_URL, data=post_data, timeout=timeout)
    if res.code == 200:
        soup = BeautifulSoup(res, "html.parser")
    else:
        raise RuntimeError("Server returned code: " + str(res.code))
    return soup
def calculate_page_count(soup):
    """Calculates the number of pages the search has yielded.

    This is required in order to calculate the number of iterations
    required to go over the input."""
    try:
        # Decode the VIEWSTATE as UTF-8, ignoring any malformed bytes
        view_state = base64.b64decode(
            soup.find_all(id="__VIEWSTATE")[0]['value']
        ).decode("utf-8", "ignore")

        # The information is located after the XML document, so remove anything before it
        view_state = re.sub(r'^([\s\S]+?)(</Results>|<Results />)', '', view_state)

        # The number of pages is calculated using two fields: the number of results
        # and the results per page. These appear in the following format:
        # |--|<result_count>|--|TRUE/FALSE|--|1|--|<results_per_page>|...
        result_count = view_state[:36].split("|")[2]
        results_per_page = view_state[:36].split("|")[8]

        # Calculate the number of pages
        return math.ceil(int(result_count) / int(results_per_page)), result_count
    except Exception:
        return 0, 0
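# Worked example for calculate_page_count (illustrative values): a decoded
# VIEWSTATE tail such as "|--|137|--|TRUE|--|1|--|10|..." splits on "|" into
# ['', '--', '137', '--', 'TRUE', '--', '1', '--', '10', ...], so the result
# count sits at index 2 (137) and the page size at index 8 (10), giving
# math.ceil(137 / 10) == 14 pages, i.e. a return value of (14, '137').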
def generate_page_postmap(soup):
    """Builds the base POST map used to request a specific result page."""
    post_map = {
        "__EVENTTARGET": "setPage",
        "__VIEWSTATE": soup.find_all(id="__VIEWSTATE")[0]['value'],
        "__EVENTVALIDATION": soup.find_all(id="__EVENTVALIDATION")[0]['value']
    }
    return post_map
# Worker thread function for fetching a results page
def fetch_page(param):
    """Returns the requested page number together with the raw result page
    fetched using post_map, or (get_number, None) upon failure.

    The argument should be a tuple or a list containing the following items
    in the following order:
    post_map -- POST data map (dict)
    req -- the OpenerDirector used to make the request (OpenerDirector)
    get_number -- the requested page's number (int)
    threads -- the crawler's thread count (int)
    timeout -- the crawler's timeout (int)"""
    # Unpack the parameters
    post_map = param[0]
    req = param[1]
    get_number = param[2]
    threads = param[3]
    timeout = param[4]

    post_map["__EVENTARGUMENT"] = get_number
    post_data = urllib.parse.urlencode(post_map).encode("UTF-8")

    # Stagger concurrent workers slightly so they do not all hit the server at once
    time.sleep(((get_number - 1) % threads) / 10.0)
    try:
        res = req.open(SEARCH_URL, data=post_data, timeout=timeout)
        req.close()
        if res.code == 200:
            return get_number, res.read()
    except Exception:
        return get_number, None
    return get_number, None
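# A minimal sketch of how fetch_page could be fanned out over a thread pool
# (the pool setup here is an assumption for illustration, not the crawler's
# actual orchestration; first_page, opener and pages stand for the values
# produced by the search helpers above):
#
#     from multiprocessing.pool import ThreadPool
#     tasks = [(generate_page_postmap(first_page), opener, n, 4, 30)
#              for n in range(2, pages + 1)]
#     with ThreadPool(4) as pool:
#         raw_pages = pool.map(fetch_page, tasks)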
def generate_interval_list(start_date, end_date):
    # Split the date range into equal, week-long subranges
    days_count = (end_date - start_date).days
    if days_count <= 7:
        r = [(start_date, end_date)]
    else:
        r = [(start_date + datetime.timedelta(7*i), start_date + datetime.timedelta(7*(i+1)-1))
             for i in range(0, math.ceil(days_count / 7))]
        r[len(r)-1] = (r[len(r)-1][0], end_date)  # Fix the last range
    return r
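# Worked example (illustrative dates): for start_date = 1 Jan 2020 and
# end_date = 20 Jan 2020, days_count is 19 and ceil(19 / 7) = 3, so the list
# is built as (01/01-07/01), (08/01-14/01), (15/01-21/01), and the last
# range is then clipped to (15/01-20/01).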
def get_entity_type(fault_entity):
    """Read the FaultEntity and use its fields to determine its type"""
    if fault_entity.interval is None:
        raise ValueError("The fault_entity object is malformed")
    if fault_entity.page is None:
        return FaultLevel.INTERVAL
    if fault_entity.verdicts is None:
        return FaultLevel.RESULT_PAGE
    return FaultLevel.VERDICT
# Stores an entity (interval / page / single verdict) or a range of entities that failed
class FaultEntity:
    interval = ()    # (start, end)
    page = None      # result page number within the interval
    verdicts = None  # [verdict_1, verdict_2, ...]

    def __init__(self, interval, page=None, verdicts=None):
        self.interval = interval
        self.page = page
        self.verdicts = verdicts

    def __str__(self):
        return "Entity data: %s - %s, page: %s, verdict: %s" % \
            (str(self.interval[0].strftime("%d/%m/%Y")),
             str(self.interval[1].strftime("%d/%m/%Y")),
             "%" if self.page is None else str(self.page),
             "%" if self.verdicts is None else str(self.verdicts)
             )
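
# Minimal usage sketch, kept under __main__ so importing the module stays
# side-effect free. It assumes the elyon1 search form is reachable and still
# accepts the fields defined above; dates and page numbers are illustrative.
if __name__ == "__main__":
    import http.cookiejar

    jar = http.cookiejar.CookieJar()
    opener, landing_page = test_connectivity(jar)

    # Search one week of verdicts and report how many result pages came back
    start = datetime.date(2015, 1, 1)
    end = datetime.date(2015, 1, 7)
    first_page = get_first_search_result_page(opener, landing_page, start, end)
    pages, results = calculate_page_count(first_page)
    print("Found %s results spread over %d pages" % (results, pages))

    # Fetch result page 2 directly, reusing the state embedded in the first page
    if pages > 1:
        number, raw_html = fetch_page((generate_page_postmap(first_page), opener, 2, 1, 30))
        print("Page %d fetch: %s" % (number, "failed" if raw_html is None else "%d bytes" % len(raw_html)))

    # Fault bookkeeping: a failed result page maps to FaultLevel.RESULT_PAGE
    fault = FaultEntity((start, end), page=2)
    print(fault, "->", get_entity_type(fault).name)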