census.py
#!/usr/bin/env python
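# Read a census spreadsheet from input-datasets/<dataset>.xlsx and write one
# JSON file per area unit code into processed-datasets/<dataset>/.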
from openpyxl import load_workbook
import re
import json
import sys
import os

YEAR = '2013'

if len(sys.argv) != 2:
    print("Usage: census.py <dataset>\n (where dataset can be found in input-datasets)")
    sys.exit(1)

NAME = sys.argv[1]

wb = load_workbook(
    filename='input-datasets/%s.xlsx' % NAME,
    read_only=True
)

worksheet_matcher = re.compile(r'area unit', re.I)
ws = None
for name in wb.sheetnames:
    if worksheet_matcher.search(name):
        ws = wb[name]
        break
if ws is None:
    raise Exception("Failed to detect worksheet")

rows = iter(ws.rows)

# skip the non-data rows at the top of the table
for _ in range(0, 8):
    next(rows)

# parse headers
headers = {}
area_unit_column = None
area_unit_matcher = re.compile(r'area unit code', re.I)
census_year_matcher = re.compile(r'[\d]+ Census', re.I)
h1 = next(rows)
h2 = next(rows)
header_prefix = h1[0].value
for i in range(0, len(h2)):
    # read the headers for this column
    header_suffix = h2[i].value
    next_prefix = h1[i].value
    # handle empty (merged) cells
    if next_prefix is not None:
        header_prefix = next_prefix
    elif header_suffix is None:
        break
    # skip old data
    if census_year_matcher.match(header_prefix) and not header_prefix.startswith(YEAR):
        continue
    # headers generally have categories, we need those
    if header_suffix is None:
        header = [ header_prefix ]
    elif header_prefix != header_suffix:
        header = [ header_prefix, header_suffix ]
    else:
        header = [ header_suffix ]
    headers[i] = header
    if area_unit_matcher.match(header[0]):
        if area_unit_column is not None:
            raise Exception("Detected multiple conflicting area unit columns!")
        area_unit_column = i

if area_unit_column is None:
    raise Exception("Failed to detect area unit column!")

output = 'processed-datasets/%s' % NAME
os.makedirs(output, exist_ok=True)
os.chdir(output)

for row in rows:
    area_unit = row[area_unit_column].value
    # Skip empty rows (should just be the final/total row at the end)
    if area_unit is None:
        continue
    obj = {}
    # Create the data structure from the row
    for column in headers.keys():
        pos = obj
        last_key = None
        # walk/create the nested dicts named by this column's header path,
        # then store the cell value at the leaf key
        for key in headers[column]:
            if last_key is not None:
                pos = pos[last_key]
            if key not in pos:
                pos[key] = {}
            last_key = key
        pos[last_key] = row[column].value
    # Write the data to a file named after the area unit code
    with open(area_unit, 'w') as fh:
        json.dump(obj, fh)
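
# Usage sketch (illustrative; the dataset name and area unit code below are
# hypothetical):
#
#   python census.py mydataset
#
# reads input-datasets/mydataset.xlsx and writes one JSON file per area unit
# into processed-datasets/mydataset/. A produced file can then be read back
# with, for example:
#
#   import json
#   with open('processed-datasets/mydataset/500100') as fh:
#       data = json.load(fh)   # nested dict keyed by the two header rows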