Implement measurement field conversion from Vision #240

Draft · zhen0427 wants to merge 7 commits into main from feature/measurement-field
Changes from 6 of 7 commits:
c215a43  Add the .yml file and find measured terminal type function. (zhen0427)
daa2689  Cover all components for measurement field. (zhen0427)
92d9666  Minor changes. (zhen0427)
cca2019  Rewrite the filter to exclude sensor on link in a general way. (zhen0427)
4f6b60d  commit (zhen0427)
47c5bac  Pause this ticket (zhen0427)
315124d  Merge branch 'main' into feature/measurement-field (nitbharambe)
@@ -117,7 +117,7 @@ def _parse_data(self, data: TabularData, data_type: str, extra_info: Optional[Ex
         # For each table in the mapping
         for table in self._mapping.tables():
-            if table not in data or len(data[table]) == 0:
+            if table not in data or len(data[table][table_mask]) == 0:
                 continue  # pragma: no cover (bug in python 3.9)
             for component, attributes in self._mapping.instances(table=table):
                 component_data = self._convert_table_to_component(

@@ -178,7 +178,14 @@ def _convert_table_to_component(
         if table not in data:
             return None

-        n_records = len(data[table])
+        table_mask = np.full(len(data[table]), True)
+        if "filter" in attributes:
+            table_mask = self._parse_filters(data, table)
+
+        n_records = np.sum(table_mask)
+
         if n_records == 0:
             return None

         try:
             pgm_data = initialize_array(data_type=data_type, component_type=component, shape=n_records)

@@ -200,6 +207,7 @@ def _convert_table_to_component(
                     attr=attr,
                     col_def=col_def,
                     extra_info=extra_info,
+                    table_mask=table_mask
                 )

         return pgm_data
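As a reading aid, the sizing idea in the hunk above can be sketched in isolation: an all-True mask is narrowed by a filter, and the sum of the mask sizes the output array. The toy table, the filter condition, and the zeros() stand-in for initialize_array are all illustrative, not the converter's real data or API.

import numpy as np
import pandas as pd

# Toy table; the column and values are invented for illustration.
table = pd.DataFrame({"terminal_type": ["node", "link", "node"]})

table_mask = np.full(len(table), True)                       # keep every row by default
table_mask &= (table["terminal_type"] != "link").to_numpy()  # one hypothetical filter

n_records = int(np.sum(table_mask))                   # rows that survive the filter
pgm_data = np.zeros(n_records, dtype=[("id", "i8")])  # stand-in for initialize_array
print(n_records, pgm_data.shape)                      # 2 (2,)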
@@ -214,6 +222,7 @@ def _convert_col_def_to_attribute(
         attr: str,
         col_def: Any,
         extra_info: Optional[ExtraInfo],
+        table_mask: np.ndarray
     ):
         """This function updates one of the attributes of pgm_data, based on the corresponding table/column in a tabular
         dataset

@@ -242,33 +251,41 @@ def _convert_col_def_to_attribute(
         """
         # To avoid mistakes, the attributes in the mapping should exist. There is one extra attribute called
         # 'extra' in which extra information can be captured.
-        if attr not in pgm_data.dtype.names and attr != "extra":
+        if attr not in pgm_data.dtype.names and attr not in ["extra", "filter"]:
             attrs = ", ".join(pgm_data.dtype.names)
             raise KeyError(f"Could not find attribute '{attr}' for '{component}s'. (choose from: {attrs})")

         if attr == "extra":
             # Extra info must be linked to the object IDs, therefore the uuids should be known before extra info can
             # be parsed. Before this for loop, it is checked that "id" exists and it is placed at the front.
             self._handle_extra_info(
-                data=data, table=table, col_def=col_def, uuids=pgm_data["id"], extra_info=extra_info
+                data=data, table=table, col_def=col_def, uuids=pgm_data["id"], extra_info=extra_info, table_mask=table_mask
             )
             # Extra info should not be added to the numpy arrays, so let's continue to the next attribute
             return

-        attr_data = self._parse_col_def(data=data, table=table, col_def=col_def, extra_info=extra_info)
+        attr_data = self._parse_col_def(data=data, table=table, col_def=col_def, extra_info=extra_info, table_mask=table_mask)

         if len(attr_data.columns) != 1:
             raise ValueError(f"DataFrame for {component}.{attr} should contain a single column ({attr_data.columns})")

         pgm_data[attr] = attr_data.iloc[:, 0]
+    def _parse_filters(self, data: TabularData, table: str) -> pd.Series:
+        # Build an all-True mask with one entry per row of the table.
+        mask = pd.Series(True, index=data[table].index)
+        # AND in every filter defined for this table. NOTE: where the filter
+        # definitions live in the mapping is still unsettled in this draft;
+        # the lookup below is a sketch, not a finished API.
+        for function, args in self._mapping.filters(table=table).items():
+            fn_ptr = get_function(function)
+            mask &= data[table].apply(lambda row: fn_ptr(row, **args), axis=1)
+        return mask
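To make the intent concrete, this is the mask-accumulation pattern the draft is reaching for, with an invented table and invented filter functions:

import pandas as pd

# Toy table and filters, invented for illustration only.
table = pd.DataFrame({"kind": ["sensor", "sensor", "load"],
                      "on_link": [True, False, False]})
filters = [lambda row: row["kind"] == "sensor",
           lambda row: not row["on_link"]]

mask = pd.Series(True, index=table.index)  # start with every row selected
for fn in filters:
    mask &= table.apply(fn, axis=1)        # each filter narrows the mask

print(table[mask])  # only the sensor that is not on a link survives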
     def _handle_extra_info(
         self,
         data: TabularData,
         table: str,
         col_def: Any,
         uuids: np.ndarray,
         extra_info: Optional[ExtraInfo],
+        table_mask
     ) -> None:
         """This function can extract extra info from the tabular data and store it in the extra_info dict

@@ -292,7 +309,7 @@ def _handle_extra_info(
         if extra_info is None:
             return

-        extra = self._parse_col_def(data=data, table=table, col_def=col_def, extra_info=None).to_dict(orient="records")
+        extra = self._parse_col_def(data=data, table=table, col_def=col_def, extra_info=None, table_mask=table_mask).to_dict(orient="records")
         for i, xtr in zip(uuids, extra):
             xtr = {
                 k[0] if isinstance(k, tuple) else k: v
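For reference, the pairing that _handle_extra_info performs can be sketched with toy values; the ids and the Name column here are invented:

import pandas as pd

# Parsed columns become one dict per row and are zipped against the ids.
uuids = [101, 102]
extra = pd.DataFrame({"Name": ["cable A", "cable B"]}).to_dict(orient="records")

extra_info = {}
for i, xtr in zip(uuids, extra):
    extra_info[i] = xtr
print(extra_info)  # {101: {'Name': 'cable A'}, 102: {'Name': 'cable B'}}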
@@ -339,7 +356,7 @@ def _serialize_data(self, data: Dataset, extra_info: Optional[ExtraInfo]) -> Tab
         return TabularData(logger=self._log, **data)

     def _parse_col_def(
-        self, data: TabularData, table: str, col_def: Any, extra_info: Optional[ExtraInfo]
+        self, data: TabularData, table: str, col_def: Any, extra_info: Optional[ExtraInfo], table_mask
     ) -> pd.DataFrame:
         """Interpret the column definition and extract/convert/create the data as a pandas DataFrame.

@@ -353,17 +370,17 @@ def _parse_col_def(
         """
         if isinstance(col_def, (int, float)):
-            return self._parse_col_def_const(data=data, table=table, col_def=col_def)
+            return self._parse_col_def_const(data=data, table=table, col_def=col_def, table_mask=table_mask)
         if isinstance(col_def, str):
-            return self._parse_col_def_column_name(data=data, table=table, col_def=col_def)
+            return self._parse_col_def_column_name(data=data, table=table, col_def=col_def, table_mask=table_mask)
         if isinstance(col_def, dict):
-            return self._parse_col_def_filter(data=data, table=table, col_def=col_def, extra_info=extra_info)
+            return self._parse_col_def_filter(data=data, table=table, col_def=col_def, extra_info=extra_info, table_mask=table_mask)
         if isinstance(col_def, list):
-            return self._parse_col_def_composite(data=data, table=table, col_def=col_def)
+            return self._parse_col_def_composite(data=data, table=table, col_def=col_def, table_mask=table_mask)
         raise TypeError(f"Invalid column definition: {col_def}")

     @staticmethod
-    def _parse_col_def_const(data: TabularData, table: str, col_def: Union[int, float]) -> pd.DataFrame:
+    def _parse_col_def_const(data: TabularData, table: str, col_def: Union[int, float], table_mask) -> pd.DataFrame:
         """Create a single column pandas DataFrame containing the const value.

         Args:

@@ -376,9 +393,9 @@ def _parse_col_def_const(data: TabularData, table: str, col_def: Union[int, floa
         """
         assert isinstance(col_def, (int, float))
-        return pd.DataFrame([col_def] * len(data[table]))
+        return pd.DataFrame([col_def] * len(data[table][table_mask]))

-    def _parse_col_def_column_name(self, data: TabularData, table: str, col_def: str) -> pd.DataFrame:
+    def _parse_col_def_column_name(self, data: TabularData, table: str, col_def: str, table_mask) -> pd.DataFrame:
         """Extract a column from the data. If the column doesn't exist, check if the col_def is a special float value,
         like 'inf'. If that's the case, create a single column pandas DataFrame containing the const value.

@@ -391,7 +408,7 @@ def _parse_col_def_column_name(self, data: TabularData, table: str, col_def: str
         """
         assert isinstance(col_def, str)
-        table_data = data[table]
+        table_data = data[table][table_mask]

         # If multiple columns are given in col_def, return the first column that exists in the dataset
         columns = [col_name.strip() for col_name in col_def.split("|")]
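The "first existing column wins" fallback described in the docstring above is easy to see with toy data; the column names here are invented:

import pandas as pd

# "P" does not exist in the frame, so the lookup falls through to "P_inj".
table_data = pd.DataFrame({"P_inj": [1.0, 2.0]})

col_def = "P | P_inj"
columns = [col_name.strip() for col_name in col_def.split("|")]
for col_name in columns:
    if col_name in table_data.columns:
        result = table_data[[col_name]]  # first match is returned
        break
print(result)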
@@ -421,7 +438,7 @@ def _apply_multiplier(self, table: str, column: str, data: pd.Series) -> pd.Seri
         return data

     def _parse_reference(
-        self, data: TabularData, table: str, other_table: str, query_column: str, key_column: str, value_column: str
+        self, data: TabularData, table: str, other_table: str, query_column: str, key_column: str, value_column: str, table_mask
     ) -> pd.DataFrame:
         """
         Find and extract a column from a different table.

@@ -437,15 +454,15 @@ def _parse_reference(
         Returns:

         """
-        queries = self._parse_col_def_column_name(data=data, table=table, col_def=query_column)
-        keys = self._parse_col_def_column_name(data=data, table=other_table, col_def=key_column)
-        values = self._parse_col_def_column_name(data=data, table=other_table, col_def=value_column)
+        queries = self._parse_col_def_column_name(data=data, table=table, col_def=query_column, table_mask=table_mask)
+        keys = self._parse_col_def_column_name(data=data, table=other_table, col_def=key_column, table_mask=table_mask)
+        values = self._parse_col_def_column_name(data=data, table=other_table, col_def=value_column, table_mask=table_mask)
         other = pd.concat([keys, values], axis=1)
         result = queries.merge(other, how="left", left_on=query_column, right_on=key_column)
         return result[[value_column]]

     def _parse_col_def_filter(
-        self, data: TabularData, table: str, col_def: Dict[str, Any], extra_info: Optional[ExtraInfo]
+        self, data: TabularData, table: str, col_def: Dict[str, Any], extra_info: Optional[ExtraInfo], table_mask
     ) -> pd.DataFrame:
         """
         Parse column filters like 'auto_id', 'reference', 'function', etc
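The cross-table lookup in _parse_reference boils down to a left merge; a toy version with invented table and column names:

import pandas as pd

# Query values from one table are matched against a key column of another
# table; the aligned value column is what the caller gets back.
queries = pd.DataFrame({"node_name": ["A", "B"]})
other = pd.DataFrame({"name": ["A", "B"], "id_ref": [101, 102]})

result = queries.merge(other, how="left", left_on="node_name", right_on="name")
print(result[["id_ref"]])  # referenced values, one per query row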
@@ -468,6 +485,7 @@ def _parse_col_def_filter(
                     ref_name=sub_def.get("name"),
                     key_col_def=sub_def["key"],
                     extra_info=extra_info,
+                    table_mask=table_mask,
                 )
             elif name == "reference":
                 # Check that (only) the required keys are in the definition

@@ -485,11 +503,12 @@ def _parse_col_def_filter(
                     query_column=sub_def["query_column"],
                     key_column=sub_def["key_column"],
                     value_column=sub_def["value_column"],
+                    table_mask=table_mask
                 )
             elif isinstance(sub_def, list):
-                col_data = self._parse_pandas_function(data=data, table=table, fn_name=name, col_def=sub_def)
+                col_data = self._parse_pandas_function(data=data, table=table, fn_name=name, col_def=sub_def, table_mask=table_mask)
             elif isinstance(sub_def, dict):
-                col_data = self._parse_function(data=data, table=table, function=name, col_def=sub_def)
+                col_data = self._parse_function(data=data, table=table, function=name, col_def=sub_def, table_mask=table_mask)
             else:
                 raise TypeError(f"Invalid {name} definition: {sub_def}")
             data_frames.append(col_data)

@@ -503,6 +522,7 @@ def _parse_auto_id(
         ref_name: Optional[str],
         key_col_def: Union[str, List[str], Dict[str, str]],
         extra_info: Optional[ExtraInfo],
+        table_mask,
     ) -> pd.DataFrame:
         """
         Create (or retrieve) a unique numerical id for each object (row) in `data[table]`, based on the `name`

@@ -517,7 +537,7 @@ def _parse_auto_id(
             key_col_def: A column definition which should be unique for each object within the current table

         Returns: A single column containing numerical ids

Review comment on the line above: probably the formatter is already complaining, so just calling it out.

         """

         # Handle reference table
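The auto-id idea behind _parse_auto_id reduces to "every distinct key gets a stable, sequential numerical id". A simplified stand-in (the real code goes through self._get_id with a key dict):

import pandas as pd

# get_id is a toy substitute for the converter's id bookkeeping.
ids: dict = {}

def get_id(key) -> int:
    # Hand out the next free id for unseen keys, reuse known ones.
    return ids.setdefault(key, len(ids))

rows = pd.DataFrame({"name": ["A", "B", "A"]})
print(rows["name"].apply(get_id).tolist())  # [0, 1, 0] — "A" reuses its id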
@@ -535,7 +555,7 @@ def _parse_auto_id(
         else:
             raise TypeError(f"Invalid key definition type '{type(key_col_def).__name__}': {key_col_def}")

-        col_data = self._parse_col_def(data=data, table=table, col_def=key_col_def, extra_info=None)
+        col_data = self._parse_col_def(data=data, table=table, col_def=key_col_def, extra_info=None, table_mask=table_mask)

         def auto_id(row: np.ndarray):
             key = dict(zip(key_names, row))

@@ -558,7 +578,7 @@ def auto_id(row: np.ndarray):
         return col_data.apply(auto_id, axis=1, raw=True)

-    def _parse_pandas_function(self, data: TabularData, table: str, fn_name: str, col_def: List[Any]) -> pd.DataFrame:
+    def _parse_pandas_function(self, data: TabularData, table: str, fn_name: str, col_def: List[Any], table_mask) -> pd.DataFrame:
         """Special vectorized functions.

         Args:
@@ -576,7 +596,7 @@ def _parse_pandas_function(self, data: TabularData, table: str, fn_name: str, co
         if fn_name == "multiply":
             fn_name = "prod"

-        col_data = self._parse_col_def(data=data, table=table, col_def=col_def, extra_info=None)
+        col_data = self._parse_col_def(data=data, table=table, col_def=col_def, extra_info=None, table_mask=table_mask)

         try:
             fn_ptr = getattr(col_data, fn_name)

@@ -599,7 +619,7 @@ def _parse_pandas_function(self, data: TabularData, table: str, fn_name: str, co
         return pd.DataFrame(fn_ptr(axis=1))

-    def _parse_function(self, data: TabularData, table: str, function: str, col_def: Dict[str, Any]) -> pd.DataFrame:
+    def _parse_function(self, data: TabularData, table: str, function: str, col_def: Dict[str, Any], table_mask) -> pd.DataFrame:
         """Import the function by name and apply it to each row.

         Args:

@@ -616,15 +636,14 @@ def _parse_function(self, data: TabularData, table: str, function: str, col_def:
         fn_ptr = get_function(function)
         key_words = list(col_def.keys())
         sub_def = list(col_def.values())
-        col_data = self._parse_col_def(data=data, table=table, col_def=sub_def, extra_info=None)
+        col_data = self._parse_col_def(data=data, table=table, col_def=sub_def, extra_info=None, table_mask=table_mask)

         if col_data.empty:
             raise ValueError(f"Cannot apply function {function} to an empty DataFrame")

         col_data = col_data.apply(lambda row, fn=fn_ptr: fn(**dict(zip(key_words, row))), axis=1, raw=True)
         return pd.DataFrame(col_data)

-    def _parse_col_def_composite(self, data: TabularData, table: str, col_def: list) -> pd.DataFrame:
+    def _parse_col_def_composite(self, data: TabularData, table: str, col_def: list, table_mask) -> pd.DataFrame:
         """Select multiple columns (each is created from a column definition) and return them as a new DataFrame.

         Args:
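The row-wise keyword application in _parse_function (column names become keyword arguments of the target function) can be tried in isolation; apparent_power and the column names here are invented:

import pandas as pd

# A toy target function; in the converter, fn_ptr is looked up by name.
def apparent_power(p: float, q: float) -> float:
    return (p**2 + q**2) ** 0.5

col_data = pd.DataFrame({"p": [3.0, 5.0], "q": [4.0, 12.0]})
key_words = list(col_data.columns)

result = col_data.apply(
    lambda row, fn=apparent_power: fn(**dict(zip(key_words, row))),
    axis=1,
    raw=True,  # raw=True passes each row as a plain ndarray
)
print(result.tolist())  # [5.0, 13.0]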
@@ -636,7 +655,7 @@ def _parse_col_def_composite(self, data: TabularData, table: str, col_def: list)
         """
         assert isinstance(col_def, list)
-        columns = [self._parse_col_def(data=data, table=table, col_def=sub_def, extra_info=None) for sub_def in col_def]
+        columns = [self._parse_col_def(data=data, table=table, col_def=sub_def, extra_info=None, table_mask=table_mask) for sub_def in col_def]
         return pd.concat(columns, axis=1)

     def _get_id(self, table: str, key: Mapping[str, int], name: Optional[str]) -> int:
Review comment: pro tip: use () for inline things. It's one of the few cases where the Python interpreter can optimize during parsing, because tuples are fixed-size. It's minor in this case, but this stuff accumulates multiplicatively, so it can add up.
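For the curious, the effect behind this tip is easy to check with the dis module; a minimal sketch (exact opcodes vary across CPython versions):

import dis

# A tuple literal of constants is folded into a single constant at
# compile time, while a list literal is rebuilt every time it runs.
dis.dis(lambda: (1, 2, 3))  # one LOAD_CONST of the whole tuple
dis.dis(lambda: [1, 2, 3])  # the list is constructed at runtime on each call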