# taxi_lightGBM_weather.py
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Plots
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import gc
# LightGBM framework
import lightgbm as lgb
"""
From github:
A fast, distributed, high performance gradient boosting (GBDT, GBRT, GBM or MART)
framework based on decision tree algorithms,
used for ranking, classification and many other machine learning tasks.
LightGBM is a gradient boosting framework that uses tree based learning algorithms.
It is designed to be distributed and efficient with the following advantages:
Faster training speed and higher efficiency
Lower memory usage
Better accuracy
Parallel and GPU learning supported
Capable of handling large-scale data
"""
########
# Load the data
########
# main train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
#load supplemental taxi route data
train_fastest_1 = pd.read_csv('fastest_routes_train_part_1.csv')
train_fastest_2 = pd.read_csv('fastest_routes_train_part_2.csv')
test_fastest = pd.read_csv('fastest_routes_test.csv')
#load the hourly weather
weather = pd.read_csv('weather_data_nyc_centralpark_2016.csv')
weather_hour = pd.read_csv('Weather.csv')
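# Note: the CSV paths above are assumed to be relative to the working directory.
# train/test are the NYC Taxi Trip Duration files; the "fastest_routes" files are the
# supplemental route summaries, and the two weather files are NYC daily (Central Park)
# and hourly weather tables (exact dataset sources assumed).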
#split the training data into a train and validation set
train, valid, _, _ = train_test_split(train, train.trip_duration,
                                      test_size=0.2, random_state=2017)
# Add set marker
train['eval_set'] = 0
valid['eval_set'] = 1
test['eval_set'] = 2
test['trip_duration'] = np.nan
test['dropoff_datetime'] = np.nan
#merge train, valid and test into a single table (reset the index so later positional lookups are safe)
frame = pd.concat([train, valid, test], axis=0, ignore_index=True)
frame_fastest = pd.concat([train_fastest_1, train_fastest_2, test_fastest], axis=0, ignore_index=True)
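# With the eval_set marker (0 = train, 1 = validation, 2 = test) all three splits share
# one pass of feature engineering below and can be separated again afterwards.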
### Memory optimization
# Main dataframe
frame.eval_set = frame.eval_set.astype(np.uint8)
frame.passenger_count = frame.passenger_count.astype(np.int8)
# Encode the Y/N flag as a single 0/1 column (equivalent to get_dummies with drop_first)
frame['store_and_fwd_flag'] = (frame['store_and_fwd_flag'] == 'Y').astype(np.int8)
frame.vendor_id = frame.vendor_id.astype(np.int8)
# Weather dataframe
weather.replace('T', 0.001, inplace=True)
weather['date'] = pd.to_datetime(weather['date'], dayfirst=True).dt.date
weather['average temperature'] = weather['average temperature'].astype(np.int64)
weather['precipitation'] = weather['precipitation'].astype(np.float64)
weather['snow fall'] = weather['snow fall'].astype(np.float64)
weather['snow depth'] = weather['snow depth'].astype(np.float64)
# Weather hourly dataframe
weather_hour['Datetime'] = pd.to_datetime(weather_hour['pickup_datetime'], dayfirst=True)
weather_hour['date'] = weather_hour.Datetime.dt.date
weather_hour['hour'] = weather_hour.Datetime.dt.hour
weather_hour['hour'] = weather_hour.hour.astype(np.int8)
weather_hour['fog'] = weather_hour.fog.astype(np.int8)
weather_hour = weather_hour[['date', 'hour', 'tempm', 'dewptm', 'hum', 'wspdm',
                             'wdird', 'vism', 'pressurei', 'fog']]
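# Column meanings (Weather Underground-style fields, assumed): tempm = temperature (C),
# dewptm = dew point (C), hum = relative humidity (%), wspdm = wind speed (km/h),
# wdird = wind direction (deg), vism = visibility (km), pressurei = pressure (inHg),
# fog = 0/1 flag.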
# Define clusters
def clusters(df):
    coords = np.vstack((df[['pickup_longitude', 'pickup_latitude']].values,
                        df[['dropoff_longitude', 'dropoff_latitude']].values))
    sample_ind = np.random.permutation(len(coords))[:500000]
    kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])
    cl_pickup = kmeans.predict(df[['pickup_longitude', 'pickup_latitude']])
    cl_dropoff = kmeans.predict(df[['dropoff_longitude', 'dropoff_latitude']])
    return cl_pickup, cl_dropoff
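# Design note: k-means is fit on a 500k-point sample of pickup and dropoff coordinates
# combined, so both trip endpoints share one vocabulary of 100 location clusters; each
# trip then gets a pickup cluster id and a dropoff cluster id.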
# Rotate the map
def rotate_coords(df, col1, col2, pic=False):
    alpha = 0.610865  # angle = 35 degrees
    #alpha = 0.506145  # angle = 29 degrees
    # Center of rotation
    x_c = df[col1].mean()
    y_c = df[col2].mean()
    # Coordinates
    C = df[[col1, col2]] - np.array([x_c, y_c])
    # Rotation matrix
    R = np.array([[np.cos(alpha), -np.sin(alpha)],
                  [np.sin(alpha), np.cos(alpha)]])
    C_rot = np.matmul(R, C.transpose().values).transpose() + np.array([x_c, y_c])
    return C_rot
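# Rationale: Manhattan's street grid is rotated roughly 29 degrees from true north, so
# rotating the coordinates by a similar angle lines the grid up with the axes and makes
# the axis-aligned (L1) distance computed below a better proxy for street distance.
# (The unused `pic` argument presumably toggled plotting in an earlier version.)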
# Manhattan distances
def my_manhattan_distances(x1, x2, y1, y2):
    return np.abs(x1 - x2) + np.abs(y1 - y2)
# Squared Euclidean distances (no sqrt is taken; the value is still monotone in the true distance)
def my_euclidean_distances(x1, x2, y1, y2):
    return np.square(x1 - x2) + np.square(y1 - y2)
my_manhattan_distances = np.vectorize(my_manhattan_distances)
my_euclidean_distances = np.vectorize(my_euclidean_distances)
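# Note: np.vectorize is only a convenience loop, not true vectorization; since both
# helpers already use elementwise NumPy operations they would also work directly on
# whole columns, so the wrappers mainly trade speed for a uniform call signature.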
# Adding features
def add_features(df, predict=False):
    # If predict=True, prepare the full train set (all training data) and the test set;
    # if predict=False, prepare the train and validation sets only.
    # train_inds are positional indices (the merges below reset the index), used later to
    # compute the group averages from training rows only.
    if predict:
        train_inds = np.where(df.eval_set.values != 2)[0]
    else:
        df = df[(df.eval_set != 2)].copy()
        train_inds = np.where(df.eval_set.values != 1)[0]
    ### Log trip
    print('Log trip duration')
    df['trip_duration'] = df.trip_duration.apply(np.log)
    ### PCA transformation
    print('Add PCA geo-coordinates')
    # Fit PCA on pickup and dropoff points together, using the same (longitude, latitude)
    # column order as the transforms below.
    coords = np.vstack((df[['pickup_longitude', 'pickup_latitude']].values,
                        df[['dropoff_longitude', 'dropoff_latitude']].values))
    pca = PCA().fit(coords)  # define the 2 main axes
    df['pickup_pca0'] = pca.transform(df[['pickup_longitude', 'pickup_latitude']])[:, 0]
    df['pickup_pca1'] = pca.transform(df[['pickup_longitude', 'pickup_latitude']])[:, 1]
    df['dropoff_pca0'] = pca.transform(df[['dropoff_longitude', 'dropoff_latitude']])[:, 0]
    df['dropoff_pca1'] = pca.transform(df[['dropoff_longitude', 'dropoff_latitude']])[:, 1]
    df['distance_pca0'] = np.abs(df.pickup_pca0 - df.dropoff_pca0)
    df['distance_pca1'] = np.abs(df.pickup_pca1 - df.dropoff_pca1)
    print('Rotate geo-coordinates')
    C_rot_pickup = rotate_coords(df, 'pickup_longitude', 'pickup_latitude', not predict)
    C_rot_dropoff = rotate_coords(df, 'dropoff_longitude', 'dropoff_latitude', not predict)
    df['pickup_longitude_rot'] = C_rot_pickup[:, 0]
    df['pickup_latitude_rot'] = C_rot_pickup[:, 1]
    df['dropoff_longitude_rot'] = C_rot_dropoff[:, 0]
    df['dropoff_latitude_rot'] = C_rot_dropoff[:, 1]
    ### Add clusters
    print('Add clusters')
    cl_pu, cl_do = clusters(df)
    df['pickup_clusters'] = cl_pu
    df['dropoff_clusters'] = cl_do
    ### to DateTime
    print('Convert to datetime format')
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    # Add weather info
    df['date'] = df['pickup_datetime'].dt.date  # adding date column
    df['hour'] = df['pickup_datetime'].dt.hour  # adding hour column
    df = pd.merge(left=df, right=weather, on='date', how='left')
    # Add weather hourly
    df = pd.merge(left=df, right=weather_hour.drop_duplicates(subset=['date', 'hour']),
                  on=['date', 'hour'], how='left')
    df.drop(['date'], axis=1, inplace=True)
    # Weather added
    df['month'] = df['pickup_datetime'].dt.month
    df['week_of_year'] = df['pickup_datetime'].dt.week
    df['day'] = df['pickup_datetime'].dt.day
    df['month_day'] = df['month'] + df['day']
    df['day_of_year'] = df['pickup_datetime'].dt.dayofyear
    #df['hour'] = df['pickup_datetime'].dt.hour
    df['day_of_year_hour'] = 24*df['day_of_year'] + df['hour']
    df['hour_minute'] = 60*df['hour'] + df['pickup_datetime'].dt.minute
    df['day_week'] = df['pickup_datetime'].dt.weekday
    df['month_day_hour'] = 31*24*df['month'] + 24*df['day'] + df['hour']
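    # The "average" features below are target means (mean log trip duration) per calendar
    # unit and per cluster. They are computed from training rows only (train_inds), so the
    # validation and test rows never contribute their own targets to these encodings.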
    ### Some useful averages ###
    print('Add averages')
    train_info = df.iloc[train_inds].copy()  # get only train data
    # Month average
    month_avg = train_info.groupby('month').trip_duration.mean()
    month_avg = month_avg.reset_index()
    month_avg.columns = ['month', 'month_avg']
    df = pd.merge(left=df, right=month_avg, on='month', how='left')
    # Week of year average
    week_year_avg = train_info.groupby('week_of_year').trip_duration.mean()
    week_year_avg = week_year_avg.reset_index()
    week_year_avg.columns = ['week_of_year', 'week_of_year_avg']
    df = pd.merge(left=df, right=week_year_avg, on='week_of_year', how='left')
    # Day of month average
    day_month_avg = train_info.groupby('day').trip_duration.mean()
    day_month_avg = day_month_avg.reset_index()
    day_month_avg.columns = ['day', 'day_of_month_avg']
    df = pd.merge(left=df, right=day_month_avg, on='day', how='left')
    # Day of year average
    day_year_avg = train_info.groupby('day_of_year').trip_duration.mean()
    day_year_avg = day_year_avg.reset_index()
    day_year_avg.columns = ['day_of_year', 'day_of_year_avg']
    df = pd.merge(left=df, right=day_year_avg, on='day_of_year', how='left')
    # Hour average
    hour_avg = train_info.groupby('hour').trip_duration.mean()
    hour_avg = hour_avg.reset_index()
    hour_avg.columns = ['hour', 'hour_avg']
    df = pd.merge(left=df, right=hour_avg, on='hour', how='left')
    # Day of week average
    day_week_avg = train_info.groupby('day_week').trip_duration.mean()
    day_week_avg = day_week_avg.reset_index()
    day_week_avg.columns = ['day_week', 'day_week_avg']
    df = pd.merge(left=df, right=day_week_avg, on='day_week', how='left')
    # Clusters
    print('Pickup clusters')
    cl_pu_avg = train_info.groupby('pickup_clusters').trip_duration.mean()
    cl_pu_avg = cl_pu_avg.reset_index()
    cl_pu_avg.columns = ['pickup_clusters', 'pickup_clusters_avg']
    df = pd.merge(left=df, right=cl_pu_avg, on='pickup_clusters', how='left')
    print('Dropoff clusters')
    cl_do_avg = train_info.groupby('dropoff_clusters').trip_duration.mean()
    cl_do_avg = cl_do_avg.reset_index()
    cl_do_avg.columns = ['dropoff_clusters', 'dropoff_clusters_avg']
    df = pd.merge(left=df, right=cl_do_avg, on='dropoff_clusters', how='left')
    ### Distances ###
    print('Add distances')
    # Manhattan distance on rotated coordinates
    df['distance_manhattan_rot'] = my_manhattan_distances(df.pickup_longitude_rot,
                                                          df.dropoff_longitude_rot,
                                                          df.pickup_latitude_rot,
                                                          df.dropoff_latitude_rot)
    # Manhattan distance on PCA coordinates
    df['distance_manhattan_pca'] = df['distance_pca0'] + df['distance_pca1']
    # (Squared) Euclidean distance
    df['distance_euclidean'] = my_euclidean_distances(df.pickup_latitude,
                                                      df.dropoff_latitude,
                                                      df.pickup_longitude,
                                                      df.dropoff_longitude)
    # Fastest route (from the supplemental route tables)
    df = pd.merge(left=df,
                  right=frame_fastest[['id', 'total_distance',
                                       'total_travel_time', 'number_of_steps']],
                  on='id', how='left')
    # Same destination
    print('Add same destination marker')
    df['same_destination'] = (df.distance_euclidean == 0).astype(np.uint)
    # Remove outliers (training rows with extremely long trip durations)
    mask = (df.eval_set != 2) & (df.trip_duration > np.log(20*24*60*60))
    print('Delete outliers:', mask.astype(np.uint).sum())
    df = df[~mask]
    return df
frame_augm = add_features(frame, predict=False)
frame_augm.head()
frame_augm.columns #63 columns
#split off the train and validation data, then drop columns not used for modelling
drop_cols = (['dropoff_datetime', 'eval_set', 'pickup_datetime']
             + ['dropoff_pca1', 'month_avg', 'dropoff_longitude',
                'dropoff_longitude_rot', 'pickup_longitude',
                'pickup_longitude_rot', 'pickup_pca0',
                'pickup_pca1', 'dropoff_pca0'])
# Train
X_train = frame_augm[(frame_augm.eval_set==0)].copy()
train_id = X_train.pop('id')
y_train = X_train.pop('trip_duration')
X_train.drop(drop_cols, axis=1, inplace=True)
X_train.head() #now 49 columns, only the training set
# Validation
X_valid = frame_augm[frame_augm.eval_set==1].copy()
valid_id = X_valid.pop('id')
y_valid = X_valid.pop('trip_duration')
X_valid.drop(drop_cols, axis=1, inplace=True)
X_valid.head() #also 49 columns
def lgb_rmsle_score(preds, dtrain):
    labels = np.exp(dtrain.get_label())
    preds = np.exp(preds.clip(min=0))
    return 'rmsle', np.sqrt(np.mean(np.square(np.log1p(preds) - np.log1p(labels)))), False
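# The model is trained on log(trip_duration), so this custom eval exponentiates the labels
# and (clipped) predictions back to seconds and computes
#     RMSLE = sqrt(mean((log1p(pred) - log1p(actual))^2)),
# which matches the RMSLE metric used for scoring; the trailing False tells LightGBM that
# lower is better.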
d_train = lgb.Dataset(X_train, y_train)
#set the parameters
lgb_params = {
    'learning_rate': 0.2,  # try 0.2
    'max_depth': 8,
    'num_leaves': 55,
    'objective': 'regression',
    #'metric': {'rmse'},
    'feature_fraction': 0.9,
    'bagging_fraction': 0.5,
    #'bagging_freq': 5,
    'max_bin': 200}
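# Parameter notes: feature_fraction samples 90% of the features for each tree, and
# bagging_fraction would sample 50% of the rows, but LightGBM only applies bagging when
# bagging_freq > 0, so with that line commented out row subsampling is effectively
# disabled; max_bin controls how finely feature values are bucketed into histograms.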
cv_result_lgb = lgb.cv(lgb_params,
                       d_train,
                       num_boost_round=5000,
                       nfold=3,
                       feval=lgb_rmsle_score,
                       early_stopping_rounds=50,
                       verbose_eval=100,
                       show_stdv=True)
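# Note: early_stopping_rounds, verbose_eval and show_stdv are keyword arguments from the
# LightGBM 2.x/3.x API; LightGBM 4.x replaced them with callbacks (lgb.early_stopping,
# lgb.log_evaluation), so this call may need adjusting on newer versions.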
#how many boosting rounds did the cross-validation run for?
n_rounds = len(cv_result_lgb['rmsle-mean'])
print('num_boost_rounds_lgb=' + str(n_rounds))
#num_boost_rounds_lgb=953
# Plain-array RMSLE helper (same formula as lgb_rmsle_score, for scoring predictions directly)
def dummy_rmsle_score(preds, y):
    return np.sqrt(np.mean(np.square(np.log1p(np.exp(preds)) - np.log1p(np.exp(y)))))
# Train a model
model_lgb = lgb.train(lgb_params,
                      d_train,
                      feval=lgb_rmsle_score,
                      num_boost_round=n_rounds)
# Predict on train
y_train_pred = model_lgb.predict(X_train)
print('RMSLE on train = {}'.format(dummy_rmsle_score(y_train_pred, y_train)))
# Predict on validation
y_valid_pred = model_lgb.predict(X_valid)
print('RMSLE on valid = {}'.format(dummy_rmsle_score(y_valid_pred, y_valid)))
feature_imp = pd.Series(dict(zip(X_train.columns, model_lgb.feature_importance()))) \
                .sort_values(ascending=False)
feature_imp
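# feature_importance() defaults to importance_type='split', i.e. how many times each
# feature is used in a split; pass importance_type='gain' for total gain instead.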
#retrain on the train and validation sets combined into one dataframe
#engineer the features over the full data (predict=True keeps the test rows)
frame_augm = add_features(frame, True)
#drop the initial columns
drop_cols = (['dropoff_datetime', 'eval_set', 'pickup_datetime']
             + ['dropoff_pca1', 'month_avg', 'dropoff_longitude',
                'dropoff_longitude_rot', 'pickup_longitude',
                'pickup_longitude_rot', 'pickup_pca0',
                'pickup_pca1', 'dropoff_pca0'])
X_train = frame_augm[(frame_augm.eval_set!=2)].copy()
train_id = X_train.pop('id')
y_train = X_train.pop('trip_duration')
X_train.drop(drop_cols, axis=1, inplace=True)
X_train.head() #49 columns again looking good
# Build the test dataframe the same way
X_test = frame_augm[frame_augm.eval_set==2].copy()
test_id = X_test.pop('id')
X_test.drop(drop_cols + ['trip_duration'], axis=1, inplace=True)
#sanity checks: do the frames line up?
print('Train shape:', X_train.shape, 'Test shape', X_test.shape)
#are the columns all the same?
(X_train.columns == X_test.columns).all()
#make the new train df (all train and validation) as single lgb dataset
d_train = lgb.Dataset(X_train, y_train)
d_train
#build the model using the params we selected above
model_lgb = lgb.train(lgb_params,
                      d_train,
                      feval=lgb_rmsle_score,
                      num_boost_round=n_rounds)
#check the model
model_lgb
### Predict on TEST data
y_test_pred = model_lgb.predict(X_test)
y_test_pred
### Submission
subm = pd.DataFrame()
subm['id'] = test_id
subm['trip_duration'] = np.exp(y_test_pred)
subm.to_csv('lightGBM_submission.csv', index=False)
#scored 0.39660 on the test data!
#this is an improvement over 0.43682 with the straight GBM;
#looking at the differences in features and in execution, the main differences are the
#addition of the weather data and the use of ~5x the training rounds.