# time_series_grp.py

# Instantiation is via plain class initialization (see the usage sketch at
# the end of the file).
from pathlib import Path

import numpy as np
import pandas as pd

import properties
'''
Description:
This is the core of the time series data class. It takes the raw data and
converts it into the form used for predictions.
Encoding of time to fill gaps is incorporated in two ways:
1. A simple day-enumeration logic is built first.
2. A second data structure is built with gap-aware date handling; the idea
   is explained below. Since occurrences of peaks might impact the
   algorithm's output, this encoding is used for visualization. It is a
   good starting point for extending to more complicated techniques.
3. In addition, a simulated dataset can be appended to the existing time
   series dataset. This is done to evaluate the obtained nearest neighbors
   without looking into the actual dataset. The simulation is created not
   for the entire dataset but for a set of handpicked users, such as a
   digital twin, outliers and normal test instances.
'''
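# For intuition: for a user who recorded on days [1, 2, 5] of a month, the
# enumerated index (method 1) is simply [0, 1, 2], while the gap-aware
# index (method 2) is roughly [0.03, 0.07, 0.17] (1/30, 2/30, 5/30), so the
# three-day gap remains visible as a larger step.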
class TimeSeriesGroupProcessing:
    # Normalize the sensor readings (divide by 100) during pre-processing.
    norm = True
    file_path = properties.ts_file_location
    simulate_file_path = properties.simulate_ts_file_location

    def __init__(self, method):
        self.ts_day = self.pre_process()
        # Enumerated day index (method 1), used for predictions.
        self.user_grp_dict_predict = self.get_m_day_ts_enumerate(method=method)
        # Gap-aware day index (method 2), used for visualization.
        self.user_grp_dict_vis = self.get_m_day_ts_encode(method=method)
    def pre_process(self):
        tyt_data = pd.read_pickle(self.file_path)
        # Concatenate the simulated data if the file exists.
        p = Path(self.simulate_file_path)
        if p.exists() and p.is_file():
            simulate_tyt_data = pd.read_pickle(self.simulate_file_path)
            simulate_tyt_data["user_id"] = simulate_tyt_data["user_id"].apply(int)
            # DataFrame.append is removed in pandas 2.x, so use pd.concat.
            overall_data = pd.concat([tyt_data, simulate_tyt_data])
            # Initially drop only the rows with a missing s03.
            overall_data.dropna(subset=["s03"], inplace=True)
        else:
            overall_data = tyt_data.copy()
            overall_data.dropna(subset=["s02", "s03", "s04", "s05", "s06", "s07"], inplace=True)
        ############### Working with Time Series Data ############################
        # Extract year, month, date, hour, minute and day from the time series
        # as separate features.
        time_data_set = overall_data.copy()
        # Convert to timestamp format for observation.
        time_data_set["created_at"] = pd.to_datetime(time_data_set["created_at"])
        daysofweek = {
            0: "Monday",
            1: "Tuesday",
            2: "Wednesday",
            3: "Thursday",
            4: "Friday",
            5: "Saturday",
            6: "Sunday"
        }
        time_data_set["year"] = time_data_set["created_at"].dt.year
        time_data_set["month"] = time_data_set["created_at"].dt.month
        time_data_set["date"] = time_data_set["created_at"].dt.date
        time_data_set["hour"] = time_data_set["created_at"].dt.hour
        time_data_set["minute"] = time_data_set["created_at"].dt.minute
        time_data_set["day"] = time_data_set["created_at"].dt.day
        time_data_set["day_of_record"] = time_data_set["created_at"].dt.dayofweek
        # time_data_set["day_of_record"] = time_data_set["created_at"].dt.dayofweek.map(daysofweek)
        if self.norm:
            for col in ["s02", "s03", "s04", "s05", "s06", "s07"]:
                time_data_set[col] = time_data_set[col] / 100
        drop_user_ids = [4, 6, 7, 9, 12, 19, 25, 53, 59, 130, 144, 145, 148, 156, 167]
        time_data_set_filtered = time_data_set[~time_data_set.user_id.isin(drop_user_ids)]
        time_data_set_filtered = time_data_set_filtered[
            ["user_id", "month", "day", "date", "s02", "s03", "s04", "s05", "s06", "s07"]]
        return time_data_set_filtered

    @staticmethod
    def aggregate_by_day(group_df, method):
        # Aggregate one user-month of readings per day with the chosen statistic.
        # numeric_only=True restricts the aggregation to the numeric columns.
        if method == "mean":
            return group_df.groupby("day").mean(numeric_only=True).reset_index()
        elif method == "max":
            return group_df.groupby("day").max(numeric_only=True).reset_index()
        elif method == "min":
            return group_df.groupby("day").min(numeric_only=True).reset_index()
        elif method == "median":
            return group_df.groupby("day").median(numeric_only=True).reset_index()
        raise ValueError("Unknown aggregation method: " + method)
    '''
    Second proposed method for the ts encoding: a gap-aware day index.
    '''
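    # A worked sketch of the gap-aware index (assuming a 30-day month): for
    # recorded days [1, 2, 5] the indices are 1/30 ~ 0.03, then 0.03 + 1/30
    # ~ 0.07 for the consecutive day, then 0.07 + 3/30 ~ 0.17 for the
    # three-day gap. Across months, if a user last recorded on day 28 (index
    # 0.93) and next records on day 3 of the following month, the gap is
    # 30 - |28 - 3| = 5 days, so the next index is 0.93 + 5/30 ~ 1.10.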
    def get_m_day_ts_encode(self, method="mean"):
        usr_grp_dict = dict()
        user_month_grp = self.ts_day.groupby(["user_id", "month"])
        print("Taking by day " + method)
        for grp in user_month_grp:
            user_id = grp[0][0]
            if user_id not in usr_grp_dict:
                # First month seen for this user.
                print("User- " + str(user_id))
                group_usr_df = self.aggregate_by_day(grp[1], method)
                # Compute the fraction of the day over a 30-day month.
                days = np.sort(group_usr_df["day"].to_numpy())
                prev_diff = 1 / 30
                norm_day_index = [prev_diff]
                for i in range(len(days) - 1):
                    diff = abs(days[i] - days[i + 1])
                    # A gap of more than one day contributes diff/30 instead
                    # of 1/30, so gaps stay visible in the index.
                    new_diff = (diff / 30) if diff > 1 else (1 / 30)
                    norm_day_index.append(new_diff + prev_diff)
                    prev_diff = new_diff + prev_diff
                # Round for a cleaner view and easier manipulation of day_index.
                day_index_count = [round(val, 2) for val in norm_day_index]
                group_usr_df["day_session_id"] = day_index_count
                usr_grp_dict[user_id] = group_usr_df[["day", "day_session_id", "s02", "s03",
                                                      "s04", "s05", "s06", "s07"]].to_numpy()
            else:
                # Subsequent month: continue the day_index from wherever the
                # last month left off.
                group_user_arr = usr_grp_dict[user_id]
                group_usr_df = self.aggregate_by_day(grp[1], method)
                print("Length of df for user " + str(user_id) + ": " + str(len(group_usr_df)))
                temp_arr = group_user_arr[len(group_user_arr) - 1]
                days = np.sort(group_usr_df["day"].to_numpy())
                # Gap filling across the month boundary. The index is not a
                # true index; the gap is accounted for by the logic below.
                prev_mday = temp_arr[0]
                curr_mday = group_usr_df["day"].iloc[0]
                # Note: 30 is taken as the number of days in a month.
                if curr_mday > prev_mday:
                    day_diff = 30 + abs(curr_mday - prev_mday)
                else:
                    day_diff = 30 - abs(prev_mday - curr_mday)
                # Exceptional case of 31-day months.
                if day_diff == 0:
                    day_diff = 1
                prev_diff = temp_arr[1]
                new_diff = day_diff / 30
                norm_day_index = [round(prev_diff + new_diff, 2)]
                prev_diff = prev_diff + new_diff
                for i in range(len(days) - 1):
                    diff = abs(days[i] - days[i + 1])
                    new_diff = (diff / 30) if diff > 1 else (1 / 30)
                    norm_day_index.append(new_diff + prev_diff)
                    prev_diff = new_diff + prev_diff
                day_index_count = [round(val, 2) for val in norm_day_index]
                group_usr_df["day_session_id"] = day_index_count
                usr_grp_dict[user_id] = np.append(group_user_arr,
                                                  group_usr_df[["day", "day_session_id", "s02",
                                                                "s03", "s04", "s05", "s06",
                                                                "s07"]].to_numpy(), axis=0)
        return usr_grp_dict
    '''
    First proposed method for the ts encoding: a plain enumeration of
    recording days.
    '''
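    # A worked sketch of the enumerated index: a user's first month with
    # recorded days [1, 2, 5] gets indices [0, 1, 2]; a following month with
    # four recorded days continues as [3, 4, 5, 6], regardless of any gaps.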
    def get_m_day_ts_enumerate(self, method="mean"):
        usr_grp_dict = dict()
        user_month_grp = self.ts_day.groupby(["user_id", "month"])
        print("Taking by day " + method)
        for grp in user_month_grp:
            user_id = grp[0][0]
            if user_id not in usr_grp_dict:
                # First month seen for this user: enumerate days from 0.
                group_usr_df = self.aggregate_by_day(grp[1], method)
                day_index_count = [i for i in range(0, len(group_usr_df["day"]))]
                group_usr_df["day_session_id"] = day_index_count
                usr_grp_dict[user_id] = group_usr_df[["day", "day_session_id", "s02", "s03",
                                                      "s04", "s05", "s06", "s07"]].to_numpy()
            else:
                # Subsequent month: continue counting from the last session id.
                group_user_arr = usr_grp_dict[user_id]
                group_usr_df = self.aggregate_by_day(grp[1], method)
                print("Length of df for user " + str(user_id) + ": " + str(len(group_usr_df)))
                temp_arr = group_user_arr[len(group_user_arr) - 1]
                day_index_count = [i for i in range(int(temp_arr[1]) + 1,
                                                    int(temp_arr[1]) + 1 + len(group_usr_df))]
                group_usr_df["day_session_id"] = day_index_count
                usr_grp_dict[user_id] = np.append(group_user_arr,
                                                  group_usr_df[["day", "day_session_id", "s02",
                                                                "s03", "s04", "s05", "s06",
                                                                "s07"]].to_numpy(), axis=0)
        return usr_grp_dict
    def get_user_mday_ts_visualize(self, user_id):
        # Time series with the gap-aware day index (method 2), for visualization.
        return self.user_grp_dict_vis[user_id]

    def get_usr_mday_ts_predict(self, user_id):
        # Time series with the enumerated day index (method 1), for predictions.
        return self.user_grp_dict_predict[user_id]
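
# A minimal usage sketch (assumes the pickle files configured in
# properties.ts_file_location and properties.simulate_ts_file_location
# exist; the user_id below is hypothetical):
if __name__ == "__main__":
    tsg = TimeSeriesGroupProcessing(method="mean")
    # Enumerated day index, as used for predictions.
    pred_ts = tsg.get_usr_mday_ts_predict(user_id=1)
    # Gap-aware day index, as used for visualization.
    vis_ts = tsg.get_user_mday_ts_visualize(user_id=1)
    print(pred_ts.shape, vis_ts.shape)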