scraper_moray.py
import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup

from foi import FOI
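# NOTE: `foi` is a local module. From its use below, FOI is assumed to accept
# the keyword arguments last_updated_at, title, tags, link and body_id, and to
# provide a serializable() method that returns a JSON-friendly dict.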


def scrape(url):
    home = 'http://www.moray.gov.uk'
    html = requests.get(url).text
    soup = BeautifulSoup(html, features="html.parser")
    all_ulists = soup.find_all("ul")
    request_list = []
    month_links = []
    for ulist in all_ulists:
        link_list = ulist.find_all("a")
        for link in link_list:
            if link["href"] != "#":
                month_links.append(link["href"])
    # month_links = ["/moray_standard/page_140447.html"]  # For testing smaller sample set
    for month in month_links:
        month_url = home + month
        raw_month = requests.get(month_url).text
        month_soup = BeautifulSoup(raw_month, features="html.parser")
        table = month_soup.table
        rows = table.find_all("tr")
        rows.pop(0)  # Removes table header
        date_tracker = ""
        for row in rows:
            row_data = row.find_all("td")
            temp_date = row_data[0].contents[0]
            tags = ["moray"]
            if temp_date != " ":  # Carries date down table until new date
                date_tracker = temp_date
            date = None
            if date_tracker.strip():
                # The log mixes several date formats; try each in turn
                for fmt in ("%d-%m-%y", "%d-%m-%Y", "%d.%m.%y",
                            "%d.%m.%Y", "%d/%m/%y", "%d/%m/%Y"):
                    try:
                        date = datetime.strptime(date_tracker, fmt)
                        break
                    except ValueError:
                        continue
                else:  # No format matched
                    raise ValueError(f"Unrecognised date: {date_tracker!r}")
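            # Alternative (assumption: python-dateutil is installed): the
            # format list above could be replaced with a single call to
            #   date = dateutil.parser.parse(date_tracker, dayfirst=True)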
            if len(row_data) > 2:  # IJB table has no department column
                department = row_data[2].contents[0]
                tags.append(department.lower())
            else:
                department = "Integrated Joint Board"
            name, number, request_url = process_link(row_data, home)
            request_list.append(FOI(last_updated_at=date, title=name, tags=tags,
                                    link=request_url, body_id=number))
            # print(request_list[-1])
    return request_list


def process_link(row_data, home):
    link_data = row_data[1].a
    if link_data:  # At least one instance of no link provided
        rel_url = link_data["href"]
        name = link_data.contents[0]
        request_url = home + rel_url
        raw_request = requests.get(request_url).text
        request_soup = BeautifulSoup(raw_request, features="html.parser")
        try:  # At least one instance of <strong> inside h2
            # Assumes the h2 text begins with an 8-character prefix
            # (such as "Request ") followed by the reference number
            number = request_soup.h2.contents[0][8:]
        except Exception:  # e.g. missing h2, or <strong> nested inside it
            number = "#"
    else:
        name = row_data[1].contents[0]
        number = "#"
        request_url = "#"
    return name, number, request_url
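# Illustration only (hypothetical values): for a row whose second cell links to
# /moray_standard/page_140447.html, process_link would return something like
#   ("Example request title", "12345-67", "http://www.moray.gov.uk/moray_standard/page_140447.html")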


if __name__ == "__main__":
    url = 'http://www.moray.gov.uk/moray_standard/page_62338.html'
    request_index = scrape(url)
    dict_list = []
    for f in request_index:
        # print(f)
        dict_list.append(f.serializable())
    with open('json_outputs/moray_foi.json', 'w') as file:
        json.dump(dict_list, file, indent=2)
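# Expected output shape (assumption: FOI.serializable() mirrors the constructor
# kwargs): json_outputs/moray_foi.json holds a list of entries roughly like
#   {"last_updated_at": "...", "title": "...", "tags": ["moray", "..."],
#    "link": "...", "body_id": "..."}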