-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocessing.py
85 lines (67 loc) · 3.13 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import json
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
# Capacity overrides, keyed by station_id (as a string). Station 7029
# (Bay St / Bloor St W) has a wrong station_id in station_information, so its
# capacity info is missing there and is supplied by hand here.
missing_capacity = {'7029': 15}
# Build a per-date season lookup from the hourly table: SeasonDF is indexed
# by 'date' and has a single 'season' column.
#
# The per-date mean is used instead of "sum, then divide by 24": both give the
# same value when a date has exactly 24 hourly rows, but the mean still yields
# the season code itself when a date has fewer (or more) rows, whereas sum/24
# would produce a fractional value that matches no season downstream.
# NOTE(review): this presumes the season is constant within a date - confirm
# against data/2017_hourly.csv.
HourlyDF = pd.read_csv('data/2017_hourly.csv', usecols=['season', 'date'])
SeasonDF = HourlyDF.groupby('date').mean()
# Columns of the joined trip-events table that the rest of the script reads.
use_cols = [
    'trip_duration_seconds',
    'trip_start_date',
    'from_station_id',
    'from_station_name',
    'to_station_id',
    'to_station_name',
    'trip_start_time',
]
EventsDF = pd.read_csv('data/2017_events_joined.csv', usecols=use_cols)

# keep trips between 1 min to 30 min (91% of whole data set); both bounds are
# exclusive. The explicit copy makes EventsDF an independent frame so the
# column assignment below never writes into a slice of the raw table.
trip_seconds = EventsDF['trip_duration_seconds']
EventsDF = EventsDF[(trip_seconds > 60) & (trip_seconds < 1800)].copy()

# just keep the hour: take the text before the first ':' of the start time and
# store it as an integer, in one vectorised pass instead of a per-row loop.
EventsDF['trip_start_time'] = (
    EventsDF['trip_start_time'].str.split(':').str[0].astype(int)
)
# station names that appear in the trip events table, whether as the origin
# or as the destination of a trip. Destinations are included so that a
# station which only ever receives bikes still gets a row in the usage tables.
station_names = set(EventsDF['from_station_name']).union(EventsDF['to_station_name'])

# capacity maps integer station_id -> capacity, read from the station
# information file; ids listed in missing_capacity use the override instead.
capacity = {}
with open('data/station_information.json', "r", encoding="utf-8") as f:
    station_info = json.load(f)
for station in station_info['data']['stations']:
    station_id = station['station_id']
    if station_id in missing_capacity:
        # 'capacity' is deliberately not read for overridden stations.
        capacity[int(station_id)] = missing_capacity[station_id]
    else:
        capacity[int(station_id)] = station['capacity']
# compute the average hourly time series using all the data, or breaking into
# subsets consisting of winter + spring, and summer + fall averages
#
# Each table has one row per station name and one column per hour (0-23).
# A trip subtracts 1/capacity from its origin station and adds 1/capacity to
# its destination station, in the column of the hour the trip started.

# Row labels are a sorted list rather than a set: a set has no defined order
# (so the CSV row order would vary between runs) and is not accepted as a
# DataFrame index by newer pandas. Destination names are merged in so that a
# station that only ever appears as a destination still has a row.
station_index = sorted(
    set(station_names).union(EventsDF['to_station_name']), key=str
)
usageTS = pd.DataFrame(np.zeros((len(station_index), 24)), index=station_index, columns=range(0, 24))
winter_spring_TS = pd.DataFrame(np.zeros((len(station_index), 24)), index=station_index, columns=range(0, 24))
summer_fall_TS = pd.DataFrame(np.zeros((len(station_index), 24)), index=station_index, columns=range(0, 24))

# date -> season, taken from the first (only) column of SeasonDF. A trip date
# that is absent from SeasonDF raises KeyError, as the row-wise lookup did.
season_of = SeasonDF.iloc[:, 0].to_dict()

trips = zip(
    EventsDF['trip_start_date'],
    EventsDF['from_station_id'],
    EventsDF['to_station_id'],
    EventsDF['from_station_name'],
    EventsDF['to_station_name'],
    EventsDF['trip_start_time'],
)
for trip_date, from_station_id, to_station_id, from_station_name, to_station_name, hr in trips:
    # seasons 1 and 2 go to the winter + spring table, everything else to
    # the summer + fall table
    if season_of[trip_date] in (1, 2):
        seasonal_TS = winter_spring_TS
    else:
        seasonal_TS = summer_fall_TS
    outflow = 1.0 / capacity[from_station_id]
    inflow = 1.0 / capacity[to_station_id]
    # .at[row, col] updates the table itself; chained `.loc[row][col] -= x`
    # may update a temporary copy and leave the table unchanged.
    for table in (usageTS, seasonal_TS):
        table.at[from_station_name, hr] -= outflow
        table.at[to_station_name, hr] += inflow

# average by the number of days for each subset
# NOTE(review): 181 and 184 are the day counts of Jan-Jun and Jul-Dec in a
# non-leap year; confirm that seasons 1-2 in the hourly file cover exactly
# that first half of the year.
usageTS /= 365.0
winter_spring_TS /= 181.0
summer_fall_TS /= 184.0

usageTS.to_csv('usage_all.csv')
winter_spring_TS.to_csv('usage_q1q2.csv')
summer_fall_TS.to_csv('usage_q3q4.csv')