# dataLoader.py
import pickle

import pandas as pd
from sklearn.preprocessing import StandardScaler
def get_exgoFeats(Name):
    """Load the exogenous features: full text features, text clustering
    centroids, and economy data for the region given by Name."""
    # load the full text features
    textFeat = pd.read_csv('./datasets/text_features.csv')
    textFeat.Date = pd.to_datetime(textFeat.Date, utc=True)
    textFeat = textFeat.set_index('Date')
    # load the text clustering centroids
    with open('./datasets/hier_clusters_feat.pkl', 'rb') as f:
        clusters = pickle.load(f)
    textCentroids = {}
    for i in range(1, 11):
        kmfeats = clusters[i]
        # copy the cluster's columns so the in-place scaling below does not
        # write into a view of textFeat
        textfeat_H = textFeat[kmfeats].copy()
        # drop missing rows before fitting, since StandardScaler rejects NaNs
        textfeat_H = textfeat_H.dropna()
        scaler = StandardScaler()
        scaler.fit(textfeat_H)
        textfeat_H.loc[:, :] = scaler.transform(textfeat_H)
        # the centroid series is the row-wise mean of the normalized features
        textfeat_H = textfeat_H.mean(axis=1).to_frame()
        textCentroids['C' + str(i)] = list(textfeat_H[0])
    textCentroids = pd.DataFrame(textCentroids, index=textFeat.index)
    textCentroids.index = pd.to_datetime(textCentroids.index, utc=True)
    # load economy features
    if Name != 'loadIreland':
        # economy data for the UK regions except for Ireland
        Ecodata = pd.read_csv('./datasets/Ecodata.csv')
    else:
        Ecodata = pd.read_csv('./datasets/Ecodata-Ireland.csv')
    Ecodata['Date'] = pd.to_datetime(Ecodata['Date'], utc=True)
    Ecodata = Ecodata.set_index('Date')
    return textFeat, textCentroids, Ecodata
def datapre_lGBM(Name, horizon, Ecodata, textFeat):
    """
    Name: the name of the dataset; if Name == '4regions', build the sum of
        EastMidlands, WestMidlands, SouthWales, and SouthWest
    horizon: forecast horizon, from 1 d to 30 d
    Ecodata, textFeat: economic and text information
    """
    Y_cols = ['target_Hour_' + str(i) for i in range(24)]
    if Name != '4regions':
        path_XYtable = './datasets/loadData/{}/XYtable_H_{}.pkl'.format(Name, horizon)
        with open(path_XYtable, 'rb') as f2:
            XY_table_h = pickle.load(f2)
        XY_table_h = pd.concat([XY_table_h[0], XY_table_h[1]], axis=1)
    else:
        # sum up the Y data and the first 24 X columns (hour 0 to hour 23),
        # then average the remaining X data
        X_hour_cols = ['Hour_' + str(i) for i in range(24)]
        path_XYtable = './datasets/loadData/EastMidlands/XYtable_H_{}.pkl'.format(horizon)
        with open(path_XYtable, 'rb') as f2:
            XY_table_h = pickle.load(f2)
        XY_table_h = pd.concat([XY_table_h[0], XY_table_h[1]], axis=1)
        for s in ['WestMidlands', 'SouthWales', 'SouthWest']:
            path_XYtable = './datasets/loadData/{}/XYtable_H_{}.pkl'.format(s, horizon)
            with open(path_XYtable, 'rb') as f2:
                N_XY_table_h = pickle.load(f2)
            N_XY_table_h = pd.concat([N_XY_table_h[0], N_XY_table_h[1]], axis=1)
            XY_table_h = XY_table_h + N_XY_table_h
        XY_table_h[X_hour_cols] = XY_table_h[X_hour_cols] / 4
        # drop the columns that are all NaN: the regions do not share the
        # same holidays, so region-specific holiday columns sum to NaN
        XY_table_h = XY_table_h.drop(columns=list(XY_table_h.columns[XY_table_h.isna().all()]))
    X_cols = [col for col in XY_table_h.columns if col not in Y_cols]
    # add economic data
    XY_table_h = XY_table_h.merge(Ecodata, left_on='Date', right_on='Date', how='inner')
    X_cols = X_cols + list(Ecodata.columns)
    # add text data before shifting between issued and target dates
    XY_table_h = XY_table_h.merge(textFeat, left_on='Date', right_on='Date', how='inner')
    # re-index each row from its issued date to its target date
    XY_table_h['issued_date'] = XY_table_h.index
    XY_table_h['target_date'] = XY_table_h['issued_date'].shift(-horizon)
    XY_table_h = XY_table_h.dropna()
    XY_table_h = XY_table_h.set_index('target_date')
    XY_table_h = XY_table_h.rename_axis('Date')
    # split into X and Y data
    X, Y = XY_table_h[X_cols], XY_table_h[Y_cols]
    return X, Y, XY_table_h
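# `scale_data` is used by getTrainTest below but is not defined in this file;
# it presumably lives elsewhere in the repo. The sketch below is an assumption
# inferred from the call sites (columns, fitted scaler, data), not the repo's
# actual helper: it applies the already-fitted scaler and rebuilds the
# DataFrame with the original columns and index.
def scale_data(columns, scaler, df):
    return pd.DataFrame(scaler.transform(df), columns=columns, index=df.index)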
def getTrainTest(X, Y):
    """Split X and Y at 2023-01-01 into train and test sets, then
    standardize them with scalers fitted on the training data only."""
    X_train, X_test = X[X.index < '2023-01-01'], X[X.index >= '2023-01-01']
    Y_train, Y_test = Y[Y.index < '2023-01-01'], Y[Y.index >= '2023-01-01']
    Xcolumns, Ycolumns = X.columns, Y.columns
    scaler_x = StandardScaler()
    scaler_x.fit(X_train)
    scaler_y = StandardScaler()
    scaler_y.fit(Y_train)
    X_train = scale_data(Xcolumns, scaler_x, X_train)
    X_test = scale_data(Xcolumns, scaler_x, X_test)
    Y_train = scale_data(Ycolumns, scaler_y, Y_train)
    # note: Y_test is returned on the original scale (scaling left commented out)
    # Y_test = scale_data(Ycolumns, scaler_y, Y_test)
    return X_train, X_test, Y_train, Y_test, scaler_x, scaler_y
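

# A minimal usage sketch chaining the three loaders together. Assumptions:
# 'EastMidlands' is a valid dataset name (its XYtable files are referenced
# above) and 7 is a horizon within the stated 1-30 d range.
if __name__ == '__main__':
    textFeat, textCentroids, Ecodata = get_exgoFeats('EastMidlands')
    X, Y, XY_table_h = datapre_lGBM('EastMidlands', 7, Ecodata, textFeat)
    X_train, X_test, Y_train, Y_test, scaler_x, scaler_y = getTrainTest(X, Y)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)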