@@ -70,192 +70,82 @@ def list_log_files(fs, devices, start_times, verbose=True):
 
 
 # -----------------------------------------------
-class SetupInflux:
-    def __init__(self, influx_url, token, org_id, influx_bucket, debug=False, verbose=True):
-        from influxdb_client import InfluxDBClient
-
-        self.influx_url = influx_url
-        self.token = token
-        self.org_id = org_id
-        self.influx_bucket = influx_bucket
-        self.debug = debug
-        self.verbose = verbose
-        self.client = InfluxDBClient(url=self.influx_url, token=self.token, org=self.org_id, debug=False)
-        self.test = self.test_influx()
-        return
-
-    def __del__(self):
-        self.client.__del__()
-
-    def get_start_times(self, devices, default_start, dynamic):
-        """Get the latest InfluxDB timestamp for each device, for use as the 'start time' when listing log files from S3
-        """
-        from datetime import datetime, timedelta
-        from dateutil.tz import tzutc
-
-        default_start_dt = datetime.strptime(default_start, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tzutc())
-        device_ids = [device.split("/")[1] for device in devices]
-        start_times = []
-
-        if self.test == 0:
-            print("Warning: Unable to connect to InfluxDB")
-        else:
-            for device in device_ids:
-                influx_time = self.client.query_api().query(
-                    f'from(bucket:"{self.influx_bucket}") |> range(start: 0, stop: now()) |> filter(fn: (r) => r["_measurement"] == "{device}") |> keep(columns: ["_time"]) |> sort(columns: ["_time"], desc: false) |> last(column: "_time")'
-                )
-
-                if len(influx_time) == 0 or dynamic == False:
-                    last_time = default_start_dt
-                else:
-                    last_time = influx_time[0].records[0]["_time"]
-                    last_time = last_time + timedelta(seconds=2)
-
-                start_times.append(last_time)
-
-        return start_times
-
-    def write_influx(self, name, df):
-        """Helper function to write data to InfluxDB
-        """
-        from influxdb_client import WriteOptions
-
-        if self.test == 0:
-            return
-
-        _write_client = self.client.write_api(
-            write_options=WriteOptions(batch_size=5000, flush_interval=1_000, jitter_interval=2_000, retry_interval=5_000,)
-        )
-
-        _write_client.write(
-            self.influx_bucket, record=df, data_frame_measurement_name=name,
-        )
-
-        if self.verbose:
-            print(f"- SUCCESS: {len(df.index)} records of {name} written to InfluxDB\n\n")
-
-        _write_client.__del__()
-
-    def delete_influx(self, device):
-        """Given a 'measurement' name (e.g. device ID), delete the related data from InfluxDB
-        """
-        start = "1970-01-01T00:00:00Z"
-        stop = "2099-01-01T00:00:00Z"
-
-        delete_api = self.client.delete_api()
-        delete_api.delete(
-            start, stop, f'_measurement="{device}"', bucket=self.influx_bucket, org=self.org_id,
-        )
-
-    def test_influx(self):
-        if self.influx_url == "influx_endpoint":
-            print("- WARNING: Please add your InfluxDB credentials\n")
-            result = 0
-        else:
-            try:
-                test = self.client.query_api().query(f'from(bucket:"{self.influx_bucket}") |> range(start: -10s)')
-                result = 1
-            except Exception as err:
-                self.print_influx_error(str(err))
-                result = 0
-
-        return result
-
-    def print_influx_error(self, err):
-        warning = "- WARNING: Unable to write data to InfluxDB |"
-
-        if "CERTIFICATE_VERIFY_FAILED" in err:
-            print(f"{warning} check your influx_url ({self.influx_url})")
-        elif "organization name" in err:
-            print(f"{warning} check your org_id ({self.org_id})")
-        elif "unauthorized access" in err:
-            print(f"{warning} check your influx_url and token")
-        elif "could not find bucket" in err:
-            print(f"{warning} check your influx_bucket ({self.influx_bucket})")
-        else:
-            print(err)
-
-
-# -----------------------------------------------
-class DataWriter:
-    def __init__(self, fs, db_list, signals, res, db_func, days_offset=None, verbose=True):
-
+class ProcessData:
+    def __init__(self, fs, db_list, signals, days_offset=None, verbose=True):
         self.db_list = db_list
         self.signals = signals
-        self.res = res
         self.fs = fs
-        self.db_func = db_func
         self.days_offset = days_offset
         self.verbose = verbose
         return
 
-    def extract_phys(self, df_raw):
-        """Given a dataframe of raw CAN data and a list of decoding databases,
-        this extracts the physical values for each database and creates a new
-        dataframe of unique physical values
+    def extract_phys(self, df_raw, tp_type=None):
+        """Given a df of raw data and a list of decoding databases, create a new df with
+        physical values (no duplicate signals, optionally filtered/rebaselined)
         """
         import can_decoder
         import pandas as pd
 
         df_phys = pd.DataFrame()
         for db in self.db_list:
             df_decoder = can_decoder.DataFrameDecoder(db)
-            df_phys = df_phys.append(df_decoder.decode_frame(df_raw))
 
+            if tp_type != None:
+                df_phys_tp = pd.DataFrame()
+                for length, group in df_raw.groupby("DataLength"):
+                    df_phys_group = df_decoder.decode_frame(group)
+                    df_phys_tp = df_phys_tp.append(df_phys_group)
+
+                df_phys = df_phys.append(df_phys_tp.sort_index())
+            else:
+                df_phys = df_phys.append(df_decoder.decode_frame(df_raw))
+
+        # remove duplicates in case multiple DBC files contain identical signals
         df_phys["datetime"] = df_phys.index
         df_phys = df_phys.drop_duplicates(keep="first")
         df_phys = df_phys.drop("datetime", 1)
 
-        return df_phys
+        # optionally filter and rebaseline the data
+        df_phys = self.filter_signals(df_phys)
+        df_phys = self.rebaseline_data(df_phys)
 
-    def decode_log_files(self, log_files):
-        """Given a list of log files, load the raw data from the fs filesystem
-        (e.g. local or S3) and convert it using a list of conversion rule databases.
+        return df_phys
 
-        :param log_files: list of log file paths (e.g. as per output of canedge_browser)
+    def rebaseline_data(self, df_phys):
+        """Given a df of physical values, this offsets the timestamps
+        to be equal to today, minus a given number of days.
         """
-        import mdf_iter, can_decoder
-        import pandas as pd
+        if not df_phys.empty and type(self.days_offset) == int:
+            from datetime import datetime, timezone
+            import pandas as pd
 
-        for log_file in log_files:
-            with self.fs.open(log_file, "rb") as handle:
-                mdf_file = mdf_iter.MdfFile(handle)
-                device_id = self.get_device_id(mdf_file)
-                df_raw = mdf_file.get_data_frame()
-
-                df_phys = self.extract_phys(df_raw)
-
-                if df_phys.empty:
-                    print("No signals were extracted")
-                else:
-                    # optionally re-baseline data timestamps to 'now - days_offset'
-                    if type(self.days_offset) == int:
-                        from datetime import datetime, timezone
+            delta_days = (datetime.now(timezone.utc) - df_phys.index.min()).days - self.days_offset
+            df_phys.index = df_phys.index + pd.Timedelta(delta_days, "day")
 
-                        delta_days = (datetime.now(timezone.utc) - df_phys.index.min()).days - self.days_offset
-                        df_phys.index = df_phys.index + pd.Timedelta(delta_days, "day")
+        return df_phys
 
-                    self.print_log_summary(device_id, log_file, df_phys)
-                    self.write_signals(device_id, df_phys)
+    def filter_signals(self, df_phys):
+        """Given a df of physical values, return only the signals matched by the filter
+        """
+        if len(self.signals):
+            df_phys = df_phys[df_phys["Signal"].isin(self.signals)]
 
-    def write_signals(self, device_id, df_phys):
-        """Given a device ID and a dataframe of physical values, optionally
-        filter, resample and write each signal to a time series database
+        return df_phys
 
-        :param device_id: ID of device (used as the 'measurement name')
-        :param df_phys: Dataframe of physical values (e.g. as per output of can_decoder)
+    def get_raw_data(self, log_file):
+        """Extract a df of raw data and the device ID from a log file
         """
+        import mdf_iter
 
-        for signal, group in df_phys.groupby("Signal")["Physical Value"]:
-            if signal in self.signals or len(self.signals) == 0:
-                df_signal = group.to_frame().rename(columns={"Physical Value": signal})
+        with self.fs.open(log_file, "rb") as handle:
+            mdf_file = mdf_iter.MdfFile(handle)
+            device_id = self.get_device_id(mdf_file)
+            df_raw = mdf_file.get_data_frame()
 
-                cnt = len(df_signal)
-                if self.res != "":
-                    df_signal = df_signal.resample(self.res).pad().dropna()
+        return df_raw, device_id
 
-                self.print_signal_summary(signal, df_signal, cnt)
-                self.db_func(device_id, df_signal)
+    def get_device_id(self, mdf_file):
+        return mdf_file.get_metadata()["HDComment.Device Information.serial number"]["value_raw"]
 
     def print_log_summary(self, device_id, log_file, df_phys):
         """Print summary information for each log file
@@ -265,16 +155,3 @@ def print_log_summary(self, device_id, log_file, df_phys):
             "\n---------------",
             f"\nDevice: {device_id} | Log file: {log_file.split(device_id)[-1]} [Extracted {len(df_phys)} decoded frames]\nPeriod: {df_phys.index.min()} - {df_phys.index.max()}\n",
         )
-
-    def print_signal_summary(self, signal, df_signal, cnt):
-        """Print summary information for each signal
-        """
-        if self.verbose:
-            print(f"Signal: {signal} (mean: {round(df_signal[signal].mean(),2)})")
-            if self.res != "":
-                print(f"- Resampling to {self.res} ({cnt} --> {len(df_signal)} records)")
-
-    def get_device_id(self, mdf_file):
-        """Extract device ID (serial number) from MDF4 log file
-        """
-        return mdf_file.get_metadata()["HDComment.Device Information.serial number"]["value_raw"]
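
For reference, a minimal usage sketch of the refactored ProcessData class. The log folder, DBC file and log file path below are illustrative assumptions (not part of this diff), and writing the decoded data to a database is now handled outside this module:

    import canedge_browser, can_decoder

    # assumptions: local CANedge log folder "LOG" and a J1939 demo DBC next to the script
    fs = canedge_browser.LocalFileSystem(base_path="LOG")
    db = can_decoder.load_dbc("CSS-Electronics-SAE-J1939-DEMO.dbc")

    proc = ProcessData(fs, db_list=[db], signals=[], days_offset=None, verbose=True)

    # hypothetical path following the <device_id>/<session>/<split>.MF4 layout
    for log_file in ["/958D2219/00000001/00000001.MF4"]:
        df_raw, device_id = proc.get_raw_data(log_file)
        # any non-None tp_type triggers the grouped transport protocol decode path
        df_phys = proc.extract_phys(df_raw, tp_type=None)
        proc.print_log_summary(device_id, log_file, df_phys)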