Commit f9d7c8b

Restructured dashboard writer code to make it easier to add custom modifications by exposing df_raw and df_phys explicitly in the main.py script. Various optimizations/clean-up

1 parent 0f054c5 commit f9d7c8b

5 files changed (+209 / -189 lines)

README.md (+2 / -2)

@@ -58,10 +58,10 @@ Another approach is to use event based triggers, e.g. via AWS Lambda functions.
 ## Other practical information

 ### Change timestamps
-If you wish to test the script using old data, you can change the timestamps so that the data is 'rebaselined' to today, minus an offset number of days. This is useful e.g. if you want to use the InfluxDB Cloud Starter, which will delete data that is older than 30 days. To rebaseline your data to start today minus 2 days, simply add `days_offset=2` in the `DataWriter` initialization.
+If you wish to test the script using old data, you can change the timestamps so that the data is 'rebaselined' to today, minus an offset number of days. This is useful e.g. if you want to use the InfluxDB Cloud Starter, which will delete data that is older than 30 days. To rebaseline your data to start today minus 2 days, simply add `days_offset=2` in the `ProcessData` initialization.

 ### Change verbosity
-By default, summary information is printed as part of the processing. You can pass `verbose=False` as an input argument in `list_log_files`, `SetupInflux` and `DataWriter` to avoid this.
+By default, summary information is printed as part of the processing. You can pass `verbose=False` as an input argument in `list_log_files`, `SetupInflux` and `ProcessData` to avoid this.

 ### Delete data from InfluxDB
 If you need to delete data in InfluxDB that you e.g. uploaded as part of a test, you can use the `delete_influx(name)` function from the `SetupInflux` class. Call it by passing the name of the 'measurement' to delete (i.e. the device ID):
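For reference, a minimal sketch of the options above, assuming the signatures introduced in this commit; all variable values and the device ID are placeholders:

# rebaseline timestamps to 'today minus 2 days' and disable summary printing
proc = ProcessData(fs, db_list, signals, days_offset=2, verbose=False)

# delete previously uploaded test data for a device (the 'measurement' name)
influx = SetupInflux(influx_url, token, org_id, influx_bucket, res)
influx.delete_influx("MY_DEVICE_ID")  # placeholder device ID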
(AWS Lambda handler file)

@@ -1,6 +1,7 @@
 import s3fs
-from utils import setup_fs, load_dbc_files, list_log_files, SetupInflux, DataWriter
-import inputs
+from utils import setup_fs, load_dbc_files, list_log_files, ProcessData
+from utils_db import SetupInflux
+import inputs as inp


 def lambda_handler(event, context=None):
@@ -9,13 +10,16 @@ def lambda_handler(event, context=None):
     log_files = [bucket + "/" + key]

     fs = s3fs.S3FileSystem(anon=False)
-    db_list = load_dbc_files(inputs.dbc_paths)
+    db_list = load_dbc_files(inp.dbc_paths)

     # initialize connection to InfluxDB
-    influx = SetupInflux(
-        influx_url=inputs.influx_url, token=inputs.token, org_id=inputs.org_id, influx_bucket=inputs.influx_bucket
-    )
+    influx = SetupInflux(inp.influx_url, inp.token, inp.org_id, inp.influx_bucket, inp.res)

     # process the log files and write extracted signals to InfluxDB
-    writer = DataWriter(fs=fs, db_list=db_list, signals=inputs.signals, res=inputs.res, db_func=influx.write_influx)
-    writer.decode_log_files(log_files)
+    proc = ProcessData(fs, db_list, inp.signals)
+
+    for log_file in log_files:
+        df_raw, device_id = proc.get_raw_data(log_file)
+        df_phys = proc.extract_phys(df_raw)
+        proc.print_log_summary(device_id, log_file, df_phys)
+        influx.write_signals(device_id, df_phys)
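For local testing, the handler can be invoked with a synthetic S3 event. The event shape below is the standard S3 PUT notification structure; the handler's bucket/key parsing lines are elided from the hunk above, so this assumes it reads `Records[0].s3.bucket.name` and `object.key`, and the bucket/key values are placeholders:

# hypothetical local invocation with a minimal synthetic S3 event
fake_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "your-canedge-bucket"},          # placeholder
                "object": {"key": "device_id/00000001/00000001.MF4"},  # placeholder
            }
        }
    ]
}

lambda_handler(fake_event)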

dashboard-writer/main.py (+18 / -11)

@@ -1,15 +1,22 @@
-from utils import setup_fs, load_dbc_files, list_log_files, SetupInflux, DataWriter
-import inputs
+from utils import setup_fs, load_dbc_files, list_log_files, ProcessData
+from utils_db import SetupInflux
+import inputs as inp

-# initialize connection to InfluxDB
-influx = SetupInflux(influx_url=inputs.influx_url, token=inputs.token, org_id=inputs.org_id, influx_bucket=inputs.influx_bucket)
-start_times = influx.get_start_times(inputs.devices, inputs.default_start, inputs.dynamic)
+# initialize connection to InfluxDB + get latest data entries per device
+influx = SetupInflux(inp.influx_url, inp.token, inp.org_id, inp.influx_bucket, inp.res)
+start_times = influx.get_start_times(inp.devices, inp.default_start, inp.dynamic)

 # setup filesystem (local/S3), load DBC files and list log files for processing
-fs = setup_fs(inputs.s3, inputs.key, inputs.secret, inputs.endpoint)
-db_list = load_dbc_files(inputs.dbc_paths)
-log_files = list_log_files(fs, inputs.devices, start_times)
+fs = setup_fs(inp.s3, inp.key, inp.secret, inp.endpoint)
+db_list = load_dbc_files(inp.dbc_paths)
+log_files = list_log_files(fs, inp.devices, start_times)

-# # process the log files and write extracted signals to InfluxDB
-writer = DataWriter(fs=fs, db_list=db_list, signals=inputs.signals, res=inputs.res, db_func=influx.write_influx)
-writer.decode_log_files(log_files)
+# process log files and write extracted signals to InfluxDB
+proc = ProcessData(fs, db_list, inp.signals)
+
+for log_file in log_files:
+    df_raw, device_id = proc.get_raw_data(log_file)
+    df_phys = proc.extract_phys(df_raw)
+
+    proc.print_log_summary(device_id, log_file, df_phys)
+    influx.write_signals(device_id, df_phys)
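The stated purpose of the restructure is that `df_raw` and `df_phys` are now explicit dataframes in `main.py`, so custom processing can be slotted in before writing. A hypothetical sketch of such a step (the signal name and filter are illustrative only, not part of the repo); it uses the `Signal` / `Physical Value` columns seen in `utils.py`:

for log_file in log_files:
    df_raw, device_id = proc.get_raw_data(log_file)
    df_phys = proc.extract_phys(df_raw)

    # hypothetical custom step: drop non-positive samples of one signal
    keep = (df_phys["Signal"] != "EngineSpeed") | (df_phys["Physical Value"] > 0)
    df_phys = df_phys[keep]

    proc.print_log_summary(device_id, log_file, df_phys)
    influx.write_signals(device_id, df_phys)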

dashboard-writer/utils.py (+44 / -168)

@@ -70,192 +70,81 @@ def list_log_files(fs, devices, start_times, verbose=True):


 # -----------------------------------------------
-class SetupInflux:
-    def __init__(self, influx_url, token, org_id, influx_bucket, debug=False, verbose=True):
-        from influxdb_client import InfluxDBClient
-
-        self.influx_url = influx_url
-        self.token = token
-        self.org_id = org_id
-        self.influx_bucket = influx_bucket
-        self.debug = debug
-        self.verbose = verbose
-        self.client = InfluxDBClient(url=self.influx_url, token=self.token, org=self.org_id, debug=False)
-        self.test = self.test_influx()
-        return
-
-    def __del__(self):
-        self.client.__del__()
-
-    def get_start_times(self, devices, default_start, dynamic):
-        """Get latest InfluxDB timestamps for devices for use as 'start times' for listing log files from S3
-        """
-        from datetime import datetime, timedelta
-        from dateutil.tz import tzutc
-
-        default_start_dt = datetime.strptime(default_start, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tzutc())
-        device_ids = [device.split("/")[1] for device in devices]
-        start_times = []
-
-        if self.test == 0:
-            print("Warning: Unable to connect to InfluxDB")
-        else:
-            for device in device_ids:
-                influx_time = self.client.query_api().query(
-                    f'from(bucket:"{self.influx_bucket}") |> range(start: 0, stop: now()) |> filter(fn: (r) => r["_measurement"] == "{device}") |> keep(columns: ["_time"]) |> sort(columns: ["_time"], desc: false) |> last(column: "_time")'
-                )
-
-                if len(influx_time) == 0 or dynamic == False:
-                    last_time = default_start_dt
-                else:
-                    last_time = influx_time[0].records[0]["_time"]
-                    last_time = last_time + timedelta(seconds=2)
-
-                start_times.append(last_time)
-
-        return start_times
-
-    def write_influx(self, name, df):
-        """Helper function to write data to InfluxDB
-        """
-        from influxdb_client import WriteOptions
-
-        if self.test == 0:
-            return
-
-        _write_client = self.client.write_api(
-            write_options=WriteOptions(batch_size=5000, flush_interval=1_000, jitter_interval=2_000, retry_interval=5_000,)
-        )
-
-        _write_client.write(
-            self.influx_bucket, record=df, data_frame_measurement_name=name,
-        )
-
-        if self.verbose:
-            print(f"- SUCCESS: {len(df.index)} records of {name} written to InfluxDB\n\n")
-
-        _write_client.__del__()
-
-    def delete_influx(self, device):
-        """Given a 'measurement' name (e.g. device ID), delete the related data from InfluxDB
-        """
-        start = "1970-01-01T00:00:00Z"
-        stop = "2099-01-01T00:00:00Z"
-
-        delete_api = self.client.delete_api()
-        delete_api.delete(
-            start, stop, f'_measurement="{device}"', bucket=self.influx_bucket, org=self.org_id,
-        )
-
-    def test_influx(self):
-        if self.influx_url == "influx_endpoint":
-            print("- WARNING: Please add your InfluxDB credentials\n")
-            result = 0
-        else:
-            try:
-                test = self.client.query_api().query(f'from(bucket:"{self.influx_bucket}") |> range(start: -10s)')
-                result = 1
-            except Exception as err:
-                self.print_influx_error(str(err))
-                result = 0
-
-        return result
-
-    def print_influx_error(self, err):
-        warning = "- WARNING: Unable to write data to InfluxDB |"
-
-        if "CERTIFICATE_VERIFY_FAILED" in err:
-            print(f"{warning} check your influx_url ({self.influx_url})")
-        elif "organization name" in err:
-            print(f"{warning} check your org_id ({self.org_id})")
-        elif "unauthorized access" in err:
-            print(f"{warning} check your influx_url and token")
-        elif "could not find bucket" in err:
-            print(f"{warning} check your influx_bucket ({self.influx_bucket})")
-        else:
-            print(err)
-
-
-# -----------------------------------------------
-class DataWriter:
-    def __init__(self, fs, db_list, signals, res, db_func, days_offset=None, verbose=True):
+class ProcessData:
+    def __init__(self, fs, db_list, signals, days_offset=None, verbose=True):
         self.db_list = db_list
         self.signals = signals
-        self.res = res
         self.fs = fs
-        self.db_func = db_func
         self.days_offset = days_offset
         self.verbose = verbose
         return

-    def extract_phys(self, df_raw):
-        """Given a dataframe of raw CAN data and a list of decoding databases,
-        this extracts the physical values for each database and creates a new
-        dataframe of unique physical values
+    def extract_phys(self, df_raw, tp_type=None):
+        """Given df of raw data and list of decoding databases, create new df with
+        physical values (no duplicate signals and optionally filtered/rebaselined)
         """
         import can_decoder
         import pandas as pd

         df_phys = pd.DataFrame()
         for db in self.db_list:
             df_decoder = can_decoder.DataFrameDecoder(db)
-            df_phys = df_phys.append(df_decoder.decode_frame(df_raw))

+            if tp_type != None:
+                df_phys_tp = pd.DataFrame()
+                for length, group in df_raw.groupby("DataLength"):
+                    df_phys_group = df_decoder.decode_frame(group)
+                    df_phys_tp = df_phys_tp.append(df_phys_group)
+
+                df_phys = df_phys.append(df_phys_tp.sort_index())
+            else:
+                df_phys = df_phys.append(df_decoder.decode_frame(df_raw))
+
+        # remove duplicates in case multiple DBC files contain identical signals
         df_phys["datetime"] = df_phys.index
         df_phys = df_phys.drop_duplicates(keep="first")
         df_phys = df_phys.drop("datetime", 1)

-        return df_phys
+        # optionally filter and rebaseline the data
+        df_phys = self.filter_signals(df_phys)
+        df_phys = self.rebaseline_data(df_phys)

-    def decode_log_files(self, log_files):
-        """Given a list of log files, load the raw data from the fs filesystem
-        (e.g. local or S3) and convert it using a list of conversion rule databases.
+        return df_phys

-        :param log_files: list of log file paths (e.g. as per output of canedge_browser)
+    def rebaseline_data(self, df_phys):
+        """Given a df of physical values, this offsets the timestamp
+        to be equal to today, minus a given number of days.
         """
-        import mdf_iter, can_decoder
-        import pandas as pd
+        if not df_phys.empty and type(self.days_offset) == int:
+            from datetime import datetime, timezone
+            import pandas as pd  # needed for pd.Timedelta below

-        for log_file in log_files:
-            with self.fs.open(log_file, "rb") as handle:
-                mdf_file = mdf_iter.MdfFile(handle)
-                device_id = self.get_device_id(mdf_file)
-                df_raw = mdf_file.get_data_frame()
-
-            df_phys = self.extract_phys(df_raw)
-
-            if df_phys.empty:
-                print("No signals were extracted")
-            else:
-                # optionally re-baseline data timestamps to 'now - days_offset'
-                if type(self.days_offset) == int:
-                    from datetime import datetime, timezone
+            delta_days = (datetime.now(timezone.utc) - df_phys.index.min()).days - self.days_offset
+            df_phys.index = df_phys.index + pd.Timedelta(delta_days, "day")

-                    delta_days = (datetime.now(timezone.utc) - df_phys.index.min()).days - self.days_offset
-                    df_phys.index = df_phys.index + pd.Timedelta(delta_days, "day")
+        return df_phys

-                self.print_log_summary(device_id, log_file, df_phys)
-                self.write_signals(device_id, df_phys)
+    def filter_signals(self, df_phys):
+        """Given a df of physical values, return only signals matched by filter
+        """
+        if len(self.signals):
+            df_phys = df_phys[df_phys["Signal"].isin(self.signals)]

-    def write_signals(self, device_id, df_phys):
-        """Given a device ID and a dataframe of physical values, optionally
-        filter, resample and write each signal to a time series database
+        return df_phys

-        :param device_id: ID of device (used as the 'measurement name')
-        :param df_phys: Dataframe of physical values (e.g. as per output of can_decoder)
+    def get_raw_data(self, log_file):
+        """Extract a df of raw data and device ID from log file
         """
+        import mdf_iter

-        for signal, group in df_phys.groupby("Signal")["Physical Value"]:
-            if signal in self.signals or len(self.signals) == 0:
-                df_signal = group.to_frame().rename(columns={"Physical Value": signal})
+        with self.fs.open(log_file, "rb") as handle:
+            mdf_file = mdf_iter.MdfFile(handle)
+            device_id = self.get_device_id(mdf_file)
+            df_raw = mdf_file.get_data_frame()

-                cnt = len(df_signal)
-                if self.res != "":
-                    df_signal = df_signal.resample(self.res).pad().dropna()
+        return df_raw, device_id

-                self.print_signal_summary(signal, df_signal, cnt)
-                self.db_func(device_id, df_signal)
+    def get_device_id(self, mdf_file):
+        return mdf_file.get_metadata()["HDComment.Device Information.serial number"]["value_raw"]

     def print_log_summary(self, device_id, log_file, df_phys):
         """Print summary information for each log file
@@ -265,16 +154,3 @@ def print_log_summary(self, device_id, log_file, df_phys):
             "\n---------------",
             f"\nDevice: {device_id} | Log file: {log_file.split(device_id)[-1]} [Extracted {len(df_phys)} decoded frames]\nPeriod: {df_phys.index.min()} - {df_phys.index.max()}\n",
         )
-
-    def print_signal_summary(self, signal, df_signal, cnt):
-        """Print summary information for each signal
-        """
-        if self.verbose:
-            print(f"Signal: {signal} (mean: {round(df_signal[signal].mean(),2)})")
-            if self.res != "":
-                print(f"- Resampling to {self.res} ({cnt} --> {len(df_signal)} records)")
-
-    def get_device_id(self, mdf_file):
-        """Extract device ID (serial number) from MDF4 log file
-        """
-        return mdf_file.get_metadata()["HDComment.Device Information.serial number"]["value_raw"]
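The new `tp_type` branch in `extract_phys` decodes frames grouped by `DataLength` rather than all at once, then restores time order via `sort_index()`. A toy pandas illustration of that grouping pattern; the data and columns are made up, `pd.concat` stands in for the now-deprecated `DataFrame.append`, and the identity step stands in for `can_decoder`'s `decode_frame`:

import pandas as pd

# toy raw-frame table; in the real pipeline this comes from mdf_iter
df_raw = pd.DataFrame(
    {"DataLength": [8, 8, 12, 12], "payload": ["a", "b", "c", "d"]},
    index=pd.to_datetime([
        "2021-01-01 00:00:00", "2021-01-01 00:00:01",
        "2021-01-01 00:00:02", "2021-01-01 00:00:03",
    ]),
)

# decode each fixed-length group separately, then restore time order
decoded_groups = []
for length, group in df_raw.groupby("DataLength"):
    decoded_groups.append(group)  # stand-in for df_decoder.decode_frame(group)

df_out = pd.concat(decoded_groups).sort_index()
print(df_out)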
