-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #137 from IDEMSInternational/feat/model-inference
Automatic model inference
- Loading branch information
Showing
7 changed files
with
576 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Models | ||
|
||
## Automatic model inference | ||
|
||
Models of sheets can now be automatically inferred if no explicit model is provided. | ||
|
||
This is done exclusively by parsing the header row of a sheet. Headers can be annotated with types (basic types and list; dict and existing models are currently not supported). If no annotation is present, the column is assumed to be a string. | ||
|
||
Examples of what the data in a column can represent: | ||
- `field`: `field` is inferred to be a string | ||
- `field:int`: `field` is inferred to be an integer | ||
- `field:list`: `field` is inferred to be a list | ||
- `field:List[int]`: `field` is inferred to be a list of integers | ||
- `field.1`: `field` is inferred to be a list, and this column contains its first entry | ||
- `field.1:int`: `field` is inferred to be a list of integers, and this column contains its first entry | ||
- `field.subfield`: `field` is inferred to be another model with one or multiple subfields, and this column contains values for the `subfield` subfield | ||
- `field.subfield:int`: `field` is inferred to be another model with one or multiple subfields, and this column contains values for the `subfield` subfield which is inferred to be an integer | ||
- `field.1.subfield`: `field` is inferred to be a list of another model with one or multiple subfields, and this column contains values for the `subfield` subfield of the first list entry | ||
|
||
Intermediate models, like those in the last three examples, are created automatically. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from collections import defaultdict | ||
from typing import List, ForwardRef, _eval_type | ||
from pydoc import locate | ||
from pydantic import create_model | ||
|
||
from rpft.parsers.common.rowparser import ( | ||
ParserModel, | ||
RowParser, | ||
RowParserError, | ||
get_field_name, | ||
is_list_type, | ||
is_parser_model_type, | ||
str_to_bool, | ||
) | ||
|
||
|
||
def type_from_string(string):
    """Resolve a type annotation given as a string to a type object.

    An empty (falsy) string resolves to ``str``. Otherwise the string is
    first looked up with ``pydoc.locate``; if that yields nothing, it is
    evaluated as a forward reference against this module's globals, which
    is what makes annotations such as ``List[int]`` work.

    Raises:
        RowParserError: if the string cannot be evaluated as a type
            expression (unknown name, invalid syntax, or an expression
            that is not a valid type).
    """
    if not string:
        # By default, assume str
        return str
    basic_type = locate(string)
    if basic_type:
        return basic_type
    # NOTE(review): evaluating a forward reference executes the string as a
    # Python expression, so annotations must come from a trusted source.
    try:
        return _eval_type(ForwardRef(string), globals(), globals())
    except (NameError, SyntaxError, TypeError) as e:
        # Malformed annotations (e.g. "List[") raise SyntaxError/TypeError
        # rather than NameError; report them all the same way and keep the
        # original exception as the cause.
        raise RowParserError(
            f'Error while parsing type "{string}": {str(e)}'
        ) from e
|
||
|
||
def get_value_for_type(type, value=None):
    """Build a value of the given type, optionally from a raw ``value``.

    List types always yield an empty list and parser-model types are always
    instantiated without arguments; ``value`` is ignored for both, since
    default values are not supported for them. For any other type, a
    missing ``value`` yields ``type()``, booleans are converted with
    ``str_to_bool``, and everything else is converted with ``type(value)``.
    """
    if is_list_type(type):
        return []
    # Parser models never take a default; other types fall back to their
    # zero-argument constructor when no value was supplied.
    if is_parser_model_type(type) or value is None:
        return type()
    if type is bool:
        return str_to_bool(value)
    return type(value)
|
||
|
||
def infer_type(string):
    """Determine the type annotated in a header string.

    The annotation is the text after the first type-annotation separator
    and before any default-value separator, with surrounding whitespace
    removed. A header without an annotation resolves to whatever
    ``type_from_string`` returns for the empty string.
    """
    _, separator, annotation = string.partition(
        RowParser.TYPE_ANNOTATION_SEPARATOR
    )
    if not separator:
        return type_from_string("")
    # Drop a trailing default-value part, if there is one.
    type_name, _, _ = annotation.partition(RowParser.DEFAULT_VALUE_SEPARATOR)
    return type_from_string(type_name.strip())
|
||
|
||
def infer_default_value(type, string):
    """Determine the default value declared in a header string.

    If the header contains a default-value separator, the text after its
    first occurrence (stripped of whitespace) is converted to ``type``.
    Otherwise the default produced by ``get_value_for_type(type)`` is used.
    """
    _, separator, raw_default = string.partition(
        RowParser.DEFAULT_VALUE_SEPARATOR
    )
    if separator:
        return get_value_for_type(type, raw_default.strip())
    return get_value_for_type(type)
|
||
|
||
def parse_header_annotations(string):
    """Return the ``(type, default value)`` pair described by a header."""
    annotated_type = infer_type(string)
    default_value = infer_default_value(annotated_type, string)
    return annotated_type, default_value
|
||
|
||
def represents_integer(string):
    """Return True if ``int(string)`` succeeds, False if it raises ValueError."""
    try:
        int(string)
    except ValueError:
        return False
    return True
|
||
|
||
def dict_to_list(dict):
    """Convert a mapping of integer indices to values into a list.

    The result is long enough to hold the largest index; positions that
    have no entry in the mapping are filled with ``None``. An empty mapping
    yields an empty list.
    """
    # default=-1 makes the empty mapping produce a zero-length list instead
    # of letting max() raise ValueError.
    size = max(dict, default=-1) + 1
    out = [None] * size
    for index, value in dict.items():
        out[index] = value
    return out
|
||
|
||
def model_from_headers(name, headers):
    """Infer a model from header strings, discarding its default value."""
    model, _default = model_from_headers_rec(name, headers)
    return model
|
||
|
||
def model_from_headers_rec(name, headers):
    """Recursively infer a model and its default value from header strings.

    Headers without a field separator become plain fields whose type and
    default come from their annotations. Headers with a field separator are
    grouped by the part before the first separator, and each group is
    turned into a nested model by a recursive call on the remainders.

    If any resulting field name is an integer, the headers are treated as
    describing a list: the result is ``List[<entry model>]`` together with
    a list of per-entry defaults. Otherwise a new ``ParserModel`` subclass
    is created from the fields.

    Returns:
        A ``(model, default value)`` tuple.

    Raises:
        RowParserError: if a list index in a header is smaller than 1.
    """
    fields = {}
    complex_fields = defaultdict(list)
    for header in headers:
        if RowParser.HEADER_FIELD_SEPARATOR in header:
            field, subheader = header.split(RowParser.HEADER_FIELD_SEPARATOR, 1)
            complex_fields[field].append(subheader)
        else:
            field = get_field_name(header)
            fields[field] = parse_header_annotations(header)
    for field, subheaders in complex_fields.items():
        # Assign model and default value
        fields[field] = model_from_headers_rec(
            name.title() + field.title(), subheaders
        )

    # In case the model that we're creating is a list,
    # all its fields are numbers (indices).
    list_model = None
    list_default_values = {}
    for field, (field_model, default_value) in fields.items():
        if not represents_integer(field):
            continue
        # Index shift: because in the headers, we count from 1
        index = int(field) - 1
        if index < 0:
            # A non-positive header index would turn into a negative list
            # position and silently overwrite another entry or fail later.
            raise RowParserError(
                f'Invalid list index "{field}" in headers of "{name}": '
                "list indices start at 1"
            )
        # We do not check whether the models for each list entry match.
        # We just take one of them.
        list_model = field_model
        list_default_values[index] = default_value
    if list_model is not None:
        return List[list_model], dict_to_list(list_default_values)

    # If the model we're creating is not a list, it's a class
    model = create_model(name.title(), __base__=ParserModel, **fields)
    return model, get_value_for_type(model)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.