From 11f1f6458e55c83175d6f65708cd3d5f615e8f50 Mon Sep 17 00:00:00 2001 From: Kristen Armes <6732445+kristenarmes@users.noreply.github.com> Date: Thu, 7 Jul 2022 10:48:49 -0700 Subject: [PATCH] feat: Adding new Trino type parser and other type metadata updates (#1917) * Adding new Trino type parser and other type metadata updates Signed-off-by: Kristen Armes * Revert change to _format_as_list function name Signed-off-by: Kristen Armes * Updating warning logging messages for when badges and descriptions aren't supported Signed-off-by: Kristen Armes * lint Signed-off-by: Kristen Armes * Bump databuilder version Signed-off-by: Kristen Armes --- databuilder/databuilder/models/badge.py | 2 +- .../databuilder/models/type_metadata.py | 165 ++++++++-- .../utils/hive_complex_type_parser.py | 3 - .../utils/trino_complex_type_parser.py | 85 +++++ databuilder/setup.py | 2 +- .../tests/unit/models/test_type_metadata.py | 109 ++++++- .../test_complex_type_transformer.py | 33 ++ .../utils/test_trino_complex_type_parser.py | 302 ++++++++++++++++++ 8 files changed, 660 insertions(+), 41 deletions(-) create mode 100644 databuilder/databuilder/utils/trino_complex_type_parser.py create mode 100644 databuilder/tests/unit/utils/test_trino_complex_type_parser.py diff --git a/databuilder/databuilder/models/badge.py b/databuilder/databuilder/models/badge.py index 2ebcbf2d5d..fbccf21831 100644 --- a/databuilder/databuilder/models/badge.py +++ b/databuilder/databuilder/models/badge.py @@ -50,7 +50,7 @@ class BadgeMetadata(GraphSerializable, TableSerializable, AtlasSerializable): BADGE_RELATION_TYPE = 'HAS_BADGE' INVERSE_BADGE_RELATION_TYPE = 'BADGE_FOR' - LABELS_PERMITTED_TO_HAVE_BADGE = ['Table', 'Dashboard', 'Column', 'Feature'] + LABELS_PERMITTED_TO_HAVE_BADGE = ['Table', 'Dashboard', 'Column', 'Feature', 'Type_Metadata'] def __init__(self, start_label: str, diff --git a/databuilder/databuilder/models/type_metadata.py b/databuilder/databuilder/models/type_metadata.py index 8c7de8fc12..e31c16ab54 100644 --- a/databuilder/databuilder/models/type_metadata.py +++ b/databuilder/databuilder/models/type_metadata.py @@ -2,15 +2,19 @@ # SPDX-License-Identifier: Apache-2.0 import abc +import logging from typing import ( - Any, Dict, Iterator, Optional, Union, + Any, Dict, Iterator, List, Optional, Union, ) +from databuilder.models.badge import Badge, BadgeMetadata from databuilder.models.description_metadata import DescriptionMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_serializable import GraphSerializable -from databuilder.models.table_metadata import ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, _format_as_list + +LOGGER = logging.getLogger(__name__) class TypeMetadata(abc.ABC, GraphSerializable): @@ -29,21 +33,55 @@ def __init__(self, name: str, parent: Union[ColumnMetadata, 'TypeMetadata'], type_str: str, - description: Optional[str] = None, sort_order: Optional[int] = None) -> None: self.name = name self.parent = parent self.type_str = type_str - self.description = DescriptionMetadata.create_description_metadata( - source=None, - text=description - ) # Sort order among TypeMetadata objects with the same parent self.sort_order = sort_order + self._description: Optional[DescriptionMetadata] = None + self._badges: Optional[List[Badge]] = None + self._node_iter = self.create_node_iterator() self._relation_iter = self.create_relation_iterator() + def get_description(self) -> 
Optional[DescriptionMetadata]: + return self._description + + def set_description(self, description: str) -> None: + if isinstance(self.parent, ColumnMetadata): + LOGGER.warning("""Frontend does not currently support setting descriptions for type metadata + objects with a ColumnMetadata parent, since the top level type metadata does + not have its own row in the column table""") + elif isinstance(self.parent, ArrayTypeMetadata): + LOGGER.warning("""Frontend does not currently support setting descriptions for type metadata + objects with an ArrayTypeMetadata parent, since this level in the nesting + hierarchy is not named and therefore is represented by short row that is not + clickable""") + else: + self._description = DescriptionMetadata.create_description_metadata( + source=None, + text=description + ) + + def get_badges(self) -> Optional[List[Badge]]: + return self._badges + + def set_badges(self, badges: Union[List[str], None] = None) -> None: + if isinstance(self.parent, ColumnMetadata): + LOGGER.warning("""Frontend does not currently support setting badges for type metadata + objects with a ColumnMetadata parent, since the top level type metadata does + not have its own row in the column table""") + elif isinstance(self.parent, ArrayTypeMetadata): + LOGGER.warning("""Frontend does not currently support setting badges for type metadata + objects with an ArrayTypeMetadata parent, since this level in the nesting + hierarchy is not named and therefore is represented by short row that is not + clickable""") + else: + formatted_badges = _format_as_list(badges) + self._badges = [Badge(badge, 'type_metadata') for badge in formatted_badges] + @abc.abstractmethod def __eq__(self, other: Any) -> bool: raise NotImplementedError @@ -82,8 +120,8 @@ def key(self) -> str: return f"{self.parent_key()}/{self.name}" def description_key(self) -> Optional[str]: - if self.description: - description_id = self.description.get_description_id() + if self._description: + description_id = self._description.get_description_id() return f"{self.key()}/{description_id}" return None @@ -124,8 +162,9 @@ def __eq__(self, other: Any) -> bool: if isinstance(other, ArrayTypeMetadata): return (self.name == other.name and self.type_str == other.type_str and - self.description == other.description and self.sort_order == other.sort_order and + self._description == other._description and + self._badges == other._badges and self.array_inner_type == other.array_inner_type and self.key() == other.key()) return False @@ -149,10 +188,18 @@ def create_node_iterator(self) -> Iterator[GraphNode]: attributes=node_attributes ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_node(description_key) + yield self._description.get_node(description_key) + + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_nodes = badge_metadata.get_badge_nodes() + for node in badge_nodes: + yield node if not self.is_terminal_type(): assert self.array_inner_type is not None, f"Array inner type must be set for {self.name}" @@ -169,15 +216,23 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]: attributes={} ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield 
self.description.get_relation( + yield self._description.get_relation( TypeMetadata.NODE_LABEL, self.key(), description_key ) + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_relations = badge_metadata.get_badge_relations() + for relation in badge_relations: + yield relation + if not self.is_terminal_type(): assert self.array_inner_type is not None, f"Array inner type must be set for {self.name}" yield from self.array_inner_type.create_relation_iterator() @@ -197,8 +252,9 @@ def __eq__(self, other: Any) -> bool: self.map_key_type == other.map_key_type and self.map_value_type == other.map_value_type and self.type_str == other.type_str and - self.description == other.description and self.sort_order == other.sort_order and + self._description == other._description and + self._badges == other._badges and self.key() == other.key()) return False @@ -221,10 +277,18 @@ def create_node_iterator(self) -> Iterator[GraphNode]: attributes=node_attributes ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_node(description_key) + yield self._description.get_node(description_key) + + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_nodes = badge_metadata.get_badge_nodes() + for node in badge_nodes: + yield node if not self.is_terminal_type(): assert self.map_key_type is not None, f"Map key type must be set for {self.name}" @@ -243,15 +307,23 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]: attributes={} ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_relation( + yield self._description.get_relation( TypeMetadata.NODE_LABEL, self.key(), description_key ) + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_relations = badge_metadata.get_badge_relations() + for relation in badge_relations: + yield relation + if not self.is_terminal_type(): assert self.map_key_type is not None, f"Map key type must be set for {self.name}" assert self.map_value_type is not None, f"Map value type must be set for {self.name}" @@ -274,8 +346,9 @@ def __eq__(self, other: Any) -> bool: if isinstance(other, ScalarTypeMetadata): return (self.name == other.name and self.type_str == other.type_str and - self.description == other.description and self.sort_order == other.sort_order and + self._description == other._description and + self._badges == other._badges and self.key() == other.key()) return False @@ -298,10 +371,18 @@ def create_node_iterator(self) -> Iterator[GraphNode]: attributes=node_attributes ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_node(description_key) + yield self._description.get_node(description_key) + + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_nodes = badge_metadata.get_badge_nodes() + for node in badge_nodes: + yield node def 
create_relation_iterator(self) -> Iterator[GraphRelationship]: yield GraphRelationship( @@ -314,15 +395,23 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]: attributes={} ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_relation( + yield self._description.get_relation( TypeMetadata.NODE_LABEL, self.key(), description_key ) + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_relations = badge_metadata.get_badge_relations() + for relation in badge_relations: + yield relation + class StructTypeMetadata(TypeMetadata): kind = 'struct' @@ -333,12 +422,12 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def __eq__(self, other: Any) -> bool: if isinstance(other, StructTypeMetadata): - return (self.name == other.name and self.struct_items == other.struct_items and self.type_str == other.type_str and - self.description == other.description and self.sort_order == other.sort_order and + self._description == other._description and + self._badges == other._badges and self.key() == other.key()) return False @@ -361,10 +450,18 @@ def create_node_iterator(self) -> Iterator[GraphNode]: attributes=node_attributes ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_node(description_key) + yield self._description.get_node(description_key) + + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_nodes = badge_metadata.get_badge_nodes() + for node in badge_nodes: + yield node if not self.is_terminal_type(): assert self.struct_items, f"Struct items must be set for {self.name}" @@ -382,15 +479,23 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]: attributes={} ) - if self.description: + if self._description: description_key = self.description_key() assert description_key is not None, f"Could not retrieve description key for {self.name}" - yield self.description.get_relation( + yield self._description.get_relation( TypeMetadata.NODE_LABEL, self.key(), description_key ) + if self._badges: + badge_metadata = BadgeMetadata(start_label=TypeMetadata.NODE_LABEL, + start_key=self.key(), + badges=self._badges) + badge_relations = badge_metadata.get_badge_relations() + for relation in badge_relations: + yield relation + if not self.is_terminal_type(): assert self.struct_items, f"Struct items must be set for {self.name}" for name, data_type in self.struct_items.items(): diff --git a/databuilder/databuilder/utils/hive_complex_type_parser.py b/databuilder/databuilder/utils/hive_complex_type_parser.py index fd66032af0..c2b8c7e62d 100644 --- a/databuilder/databuilder/utils/hive_complex_type_parser.py +++ b/databuilder/databuilder/utils/hive_complex_type_parser.py @@ -1,7 +1,6 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -import logging from typing import Union from pyparsing import ( @@ -13,8 +12,6 @@ ArrayTypeMetadata, MapTypeMetadata, ScalarTypeMetadata, StructTypeMetadata, TypeMetadata, ) -LOGGER = logging.getLogger(__name__) - array_keyword = Keyword("array") map_keyword = Keyword("map") struct_keyword = Keyword("struct") diff --git a/databuilder/databuilder/utils/trino_complex_type_parser.py b/databuilder/databuilder/utils/trino_complex_type_parser.py new file mode 100644 index 0000000000..288521089e --- /dev/null +++ b/databuilder/databuilder/utils/trino_complex_type_parser.py @@ -0,0 +1,85 @@ +# Copyright Contributors to the Amundsen project. +# SPDX-License-Identifier: Apache-2.0 + +from typing import Union + +from pyparsing import ( + Forward, Group, Keyword, OneOrMore, Optional, Word, alphanums, delimitedList, nestedExpr, nums, originalTextFor, +) + +from databuilder.models.table_metadata import ColumnMetadata +from databuilder.models.type_metadata import ( + ArrayTypeMetadata, MapTypeMetadata, ScalarTypeMetadata, StructTypeMetadata, TypeMetadata, +) + +array_keyword = Keyword("array") +map_keyword = Keyword("map") +struct_keyword = Keyword("row") + +field_name = Word(alphanums + "_") +field_type = Forward() + +# Scalar types +scalar_quantifier = "(" + Word(nums) + Optional(")" | "," + Word(nums) + ")") +scalar_type = OneOrMore(Word(alphanums + "_")) + Optional(scalar_quantifier) + +# Complex types +array_field = "(" + field_type("type") +map_field = originalTextFor(scalar_type)("key") + "," + field_type("type") +struct_field = field_name("name") + field_type("type") +struct_list = delimitedList(Group(struct_field)) +array_type = nestedExpr( + opener=array_keyword, closer=")", content=array_field, ignoreExpr=None +) +map_type = nestedExpr( + opener=map_keyword + "(", closer=")", content=map_field, ignoreExpr=None +) +struct_type = nestedExpr( + opener=struct_keyword + "(", closer=")", content=struct_list, ignoreExpr=None +) + +field_type <<= originalTextFor(array_type | map_type | struct_type | scalar_type) + +complex_type = (array_type("array_type") | map_type("map_type") | struct_type("struct_type") | + scalar_type("scalar_type")) + + +def parse_trino_type(type_str: str, name: str, parent: Union[ColumnMetadata, TypeMetadata]) -> TypeMetadata: + type_str = type_str.lower() + type_str = type_str.replace('\"', '') # Remove quotes around names that are added when querying from HMS + parsed_type = complex_type.parseString(type_str, parseAll=True) + + if parsed_type.scalar_type: + return ScalarTypeMetadata(name=name, + parent=parent, + type_str=type_str) + + results = parsed_type[0] + if parsed_type.array_type: + array_type_metadata = ArrayTypeMetadata(name=name, + parent=parent, + type_str=type_str) + array_inner_type = parse_trino_type(results.type, '_inner_', array_type_metadata) + if not isinstance(array_inner_type, ScalarTypeMetadata): + array_type_metadata.array_inner_type = array_inner_type + return array_type_metadata + elif parsed_type.map_type: + map_type_metadata = MapTypeMetadata(name=name, + parent=parent, + type_str=type_str) + map_type_metadata.map_key_type = parse_trino_type(results.key, '_map_key', map_type_metadata) + map_type_metadata.map_value_type = parse_trino_type(results.type, '_map_value', map_type_metadata) + return map_type_metadata + elif parsed_type.struct_type: + struct_type_metadata = StructTypeMetadata(name=name, + parent=parent, + type_str=type_str) + struct_items = {} + for index, result in enumerate(results): + 
struct_items[result.name] = parse_trino_type(result.type, result.name, struct_type_metadata) + struct_items[result.name].sort_order = index + + struct_type_metadata.struct_items = struct_items + return struct_type_metadata + else: + raise Exception(f"Unrecognized type: {type_str}") diff --git a/databuilder/setup.py b/databuilder/setup.py index 4abe9d9675..ca02241a31 100644 --- a/databuilder/setup.py +++ b/databuilder/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -__version__ = '6.11.1' +__version__ = '6.12.0' requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt') diff --git a/databuilder/tests/unit/models/test_type_metadata.py b/databuilder/tests/unit/models/test_type_metadata.py index a0bfa0cb05..dd5c39c836 100644 --- a/databuilder/tests/unit/models/test_type_metadata.py +++ b/databuilder/tests/unit/models/test_type_metadata.py @@ -492,7 +492,6 @@ def test_serialize_struct_type_metadata(self) -> None: name='c3', parent=nested_struct_type_metadata_level2, type_str='string', - description='description of c3' ) nested_scalar_type_metadata_c4 = ScalarTypeMetadata( name='c4', @@ -503,7 +502,6 @@ def test_serialize_struct_type_metadata(self) -> None: name='c5', parent=struct_type_metadata, type_str='string', - description='description of c5' ) struct_type_metadata.struct_items = {'c1': nested_struct_type_metadata_level1, @@ -517,6 +515,11 @@ def test_serialize_struct_type_metadata(self) -> None: nested_scalar_type_metadata_c3.sort_order = 0 nested_scalar_type_metadata_c4.sort_order = 1 + nested_scalar_type_metadata_c3.set_description('description of c3') + nested_scalar_type_metadata_c3.set_badges(['badge1']) + nested_scalar_type_metadata_c5.set_description('description of c5') + nested_scalar_type_metadata_c5.set_badges(['badge1', 'badge2']) + expected_nodes = [ {'kind': 'struct', 'name': 'col1', 'data_type': 'struct>,c5:string>', @@ -533,6 +536,7 @@ def test_serialize_struct_type_metadata(self) -> None: {'description': 'description of c3', 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/c2/c3/_description', 'LABEL': 'Description', 'description_source': 'description'}, + {'KEY': 'badge1', 'LABEL': 'Badge', 'category': 'type_metadata'}, {'kind': 'scalar', 'name': 'c4', 'data_type': 'string', 'LABEL': 'Type_Metadata', 'sort_order:UNQUOTED': 1, 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/c2/c4'}, @@ -541,7 +545,9 @@ def test_serialize_struct_type_metadata(self) -> None: 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c5'}, {'description': 'description of c5', 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c5/_description', - 'LABEL': 'Description', 'description_source': 'description'} + 'LABEL': 'Description', 'description_source': 'description'}, + {'KEY': 'badge1', 'LABEL': 'Badge', 'category': 'type_metadata'}, + {'KEY': 'badge2', 'LABEL': 'Badge', 'category': 'type_metadata'} ] expected_rels = [ {'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1', @@ -564,6 +570,9 @@ def test_serialize_struct_type_metadata(self) -> None: 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Description', 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/c2/c3', 'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'}, + {'END_KEY': 'badge1', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Badge', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/c2/c3', + 'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}, {'END_KEY': 
'hive://gold.test_schema1/test_table1/col1/type/col1/c1/c2/c4', 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/c2', 'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', @@ -575,7 +584,13 @@ def test_serialize_struct_type_metadata(self) -> None: {'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c5/_description', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Description', 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c5', - 'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'} + 'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'}, + {'END_KEY': 'badge1', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Badge', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c5', + 'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}, + {'END_KEY': 'badge2', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Badge', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c5', + 'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'} ] node_row = struct_type_metadata.next_node() @@ -611,7 +626,6 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: name='c1', parent=struct_type_metadata, type_str='map>', - description='description of map' ) nested_map_key = ScalarTypeMetadata( name='_map_key', @@ -627,7 +641,6 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: name='c2', parent=struct_type_metadata, type_str='array', - description='description of array' ) struct_type_metadata.struct_items = {'c1': nested_map_type_metadata_level1, @@ -637,6 +650,11 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: nested_map_type_metadata_level1.sort_order = 0 nested_array_type_metadata_level1.sort_order = 1 + nested_map_type_metadata_level1.set_description('description of map') + nested_map_type_metadata_level1.set_badges(['badge1']) + nested_array_type_metadata_level1.set_description('description of array') + nested_array_type_metadata_level1.set_badges(['badge1', 'badge2']) + expected_nodes = [ {'kind': 'struct', 'name': 'col1', 'LABEL': 'Type_Metadata', 'data_type': 'struct>,c2:array>', @@ -647,6 +665,7 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: {'description': 'description of map', 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_description', 'LABEL': 'Description', 'description_source': 'description'}, + {'KEY': 'badge1', 'LABEL': 'Badge', 'category': 'type_metadata'}, {'kind': 'scalar', 'name': '_map_key', 'data_type': 'string', 'LABEL': 'Type_Metadata', 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_map_key'}, @@ -659,6 +678,8 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: {'description': 'description of array', 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2/_description', 'LABEL': 'Description', 'description_source': 'description'}, + {'KEY': 'badge1', 'LABEL': 'Badge', 'category': 'type_metadata'}, + {'KEY': 'badge2', 'LABEL': 'Badge', 'category': 'type_metadata'} ] expected_rels = [ {'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1', @@ -673,6 +694,9 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Description', 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1', 'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'}, + {'END_KEY': 'badge1', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Badge', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1', + 'TYPE': 'HAS_BADGE', 
'REVERSE_TYPE': 'BADGE_FOR'}, {'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_map_key', 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1', 'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', @@ -689,6 +713,12 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Description', 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2', 'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'}, + {'END_KEY': 'badge1', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Badge', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2', + 'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}, + {'END_KEY': 'badge2', 'START_LABEL': 'Type_Metadata', 'END_LABEL': 'Badge', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2', + 'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'} ] node_row = struct_type_metadata.next_node() @@ -711,6 +741,73 @@ def test_serialize_struct_map_array_type_metadata(self) -> None: for i in range(0, len(expected_rels)): self.assertEqual(actual[i], expected_rels[i]) + def test_set_unsupported_descriptions_and_badges(self) -> None: + column = ColumnMetadata('col1', None, 'array>', 0) + column.set_column_key(self.column_key) + + array_type_metadata = ArrayTypeMetadata( + name='col1', + parent=column, + type_str='array>' + ) + nested_array_type_metadata_level1 = ArrayTypeMetadata( + name='_inner_', + parent=array_type_metadata, + type_str='array' + ) + nested_scalar_type_metadata_level2 = ScalarTypeMetadata( + name='_inner_', + parent=nested_array_type_metadata_level1, + type_str='string' + ) + + array_type_metadata.array_inner_type = nested_array_type_metadata_level1 + nested_array_type_metadata_level1.array_inner_type = nested_scalar_type_metadata_level2 + + # Descriptions and badges are set, but they do not appear in the expected nodes and relations + # since they are unsupported for those with parents of ColumnMetadata or ArrayTypeMetadata types + array_type_metadata.set_description('description 1') + array_type_metadata.set_badges(['badge1']) + nested_array_type_metadata_level1.set_description('description 2') + nested_array_type_metadata_level1.set_badges(['badge1']) + + expected_nodes = [ + {'kind': 'array', 'name': 'col1', 'LABEL': 'Type_Metadata', 'data_type': 'array>', + 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1'}, + {'kind': 'array', 'name': '_inner_', 'LABEL': 'Type_Metadata', 'data_type': 'array', + 'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_'} + ] + expected_rels = [ + {'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1', + 'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Column', + 'TYPE': 'TYPE_METADATA', 'REVERSE_TYPE': 'TYPE_METADATA_OF'}, + {'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_', + 'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1', + 'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', + 'TYPE': 'SUBTYPE', 'REVERSE_TYPE': 'SUBTYPE_OF'} + ] + + node_row = array_type_metadata.next_node() + actual = [] + while node_row: + node_row_serialized = neo4_serializer.serialize_node(node_row) + actual.append(node_row_serialized) + node_row = array_type_metadata.next_node() + for i in range(0, len(expected_nodes)): + self.assertEqual(actual[i], expected_nodes[i]) + + relation_row = array_type_metadata.next_relation() + actual = [] + while relation_row: + 
relation_row_serialized = neo4_serializer.serialize_relationship( + relation_row + ) + actual.append(relation_row_serialized) + relation_row = array_type_metadata.next_relation() + for i in range(0, len(expected_rels)): + self.assertEqual(actual[i], expected_rels[i]) + if __name__ == '__main__': unittest.main() diff --git a/databuilder/tests/unit/transformer/test_complex_type_transformer.py b/databuilder/tests/unit/transformer/test_complex_type_transformer.py index 6273293225..391eba2f5c 100644 --- a/databuilder/tests/unit/transformer/test_complex_type_transformer.py +++ b/databuilder/tests/unit/transformer/test_complex_type_transformer.py @@ -102,6 +102,39 @@ def test_hive_parser_usage(self) -> None: self.assertEqual(transformer.success_count, 1) self.assertEqual(transformer.failure_count, 0) + def test_trino_parser_usage(self) -> None: + transformer = ComplexTypeTransformer() + config = ConfigFactory.from_dict({ + PARSING_FUNCTION: 'databuilder.utils.trino_complex_type_parser.parse_trino_type', + }) + transformer.init(conf=config) + + column = ColumnMetadata('col1', 'array type', 'array(array(int))', 0) + table_metadata = TableMetadata( + 'trino', + 'gold', + 'test_schema', + 'test_table', + 'test_table', + [column] + ) + array_type = ArrayTypeMetadata(name='col1', + parent=column, + type_str='array(array(int))') + inner_array = ArrayTypeMetadata(name='_inner_', + parent=array_type, + type_str='array(int)') + + array_type.array_inner_type = inner_array + + result = transformer.transform(table_metadata) + + for actual in result.columns: + self.assertTrue(isinstance(actual.get_type_metadata(), TypeMetadata)) + self.assertEqual(actual.get_type_metadata(), array_type) + self.assertEqual(transformer.success_count, 1) + self.assertEqual(transformer.failure_count, 0) + if __name__ == '__main__': unittest.main() diff --git a/databuilder/tests/unit/utils/test_trino_complex_type_parser.py b/databuilder/tests/unit/utils/test_trino_complex_type_parser.py new file mode 100644 index 0000000000..9877966dea --- /dev/null +++ b/databuilder/tests/unit/utils/test_trino_complex_type_parser.py @@ -0,0 +1,302 @@ +# Copyright Contributors to the Amundsen project. 
+# SPDX-License-Identifier: Apache-2.0 + +import unittest + +from pyparsing import ParseException + +from databuilder.models.table_metadata import ColumnMetadata +from databuilder.models.type_metadata import ( + ArrayTypeMetadata, MapTypeMetadata, ScalarTypeMetadata, StructTypeMetadata, +) +from databuilder.utils.trino_complex_type_parser import parse_trino_type + + +class TestTrinoComplexTypeParser(unittest.TestCase): + def setUp(self) -> None: + self.column_key = 'trino://gold.test_schema/test_table/col1' + + def test_transform_no_complex_type(self) -> None: + column = ColumnMetadata('col1', None, 'int', 0) + column.set_column_key(self.column_key) + + scalar_type = ScalarTypeMetadata(name='col1', + parent=column, + type_str='int') + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, scalar_type) + + def test_transform_array_type(self) -> None: + column = ColumnMetadata('col1', None, 'array(array(int))', 0) + column.set_column_key(self.column_key) + + array_type = ArrayTypeMetadata(name='col1', + parent=column, + type_str='array(array(int))') + inner_array = ArrayTypeMetadata(name='_inner_', + parent=array_type, + type_str='array(int)') + + array_type.array_inner_type = inner_array + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, array_type) + + def test_transform_array_map_nested_type(self) -> None: + column = ColumnMetadata('col1', None, 'array(map(string,int))', 0) + column.set_column_key(self.column_key) + + array_type = ArrayTypeMetadata(name='col1', + parent=column, + type_str='array(map(string,int))') + inner_map = MapTypeMetadata(name='_inner_', + parent=array_type, + type_str='map(string,int)') + inner_map_key = ScalarTypeMetadata(name='_map_key', + parent=inner_map, + type_str='string') + inner_scalar = ScalarTypeMetadata(name='_map_value', + parent=inner_map, + type_str='int') + + array_type.array_inner_type = inner_map + inner_map.map_key_type = inner_map_key + inner_map.map_value_type = inner_scalar + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, array_type) + + def test_transform_array_struct_nested_type(self) -> None: + column = ColumnMetadata('col1', None, 'array(row(nest1 int,nest2 int))', 0) + column.set_column_key(self.column_key) + + array_type = ArrayTypeMetadata(name='col1', + parent=column, + type_str='array(row(nest1 int,nest2 int))') + inner_struct = StructTypeMetadata(name='_inner_', + parent=array_type, + type_str='row(nest1 int,nest2 int)') + inner_scalar_nest1 = ScalarTypeMetadata(name='nest1', + parent=inner_struct, + type_str='int') + inner_scalar_nest2 = ScalarTypeMetadata(name='nest2', + parent=inner_struct, + type_str='int') + + array_type.array_inner_type = inner_struct + inner_struct.struct_items = {'nest1': inner_scalar_nest1, 'nest2': inner_scalar_nest2} + inner_scalar_nest1.sort_order = 0 + inner_scalar_nest2.sort_order = 1 + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, array_type) + + def test_transform_map_type(self) -> None: + column = ColumnMetadata('col1', None, 'map(string,map(string,int))', 0) + column.set_column_key(self.column_key) + + map_type = MapTypeMetadata(name='col1', + parent=column, + type_str='map(string,map(string,int))') + map_key = ScalarTypeMetadata(name='_map_key', + parent=map_type, + type_str='string') + map_value = MapTypeMetadata(name='_map_value', + parent=map_type, + type_str='map(string,int)') + inner_map_key = ScalarTypeMetadata(name='_map_key', + 
parent=map_value, + type_str='string') + inner_scalar = ScalarTypeMetadata(name='_map_value', + parent=map_value, + type_str='int') + + map_type.map_key_type = map_key + map_type.map_value_type = map_value + map_value.map_key_type = inner_map_key + map_value.map_value_type = inner_scalar + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, map_type) + + def test_transform_map_struct_nested_type(self) -> None: + column = ColumnMetadata('col1', None, 'map(string,row(nest1 int,nest2 int))', 0) + column.set_column_key(self.column_key) + + map_type = MapTypeMetadata(name='col1', + parent=column, + type_str='map(string,row(nest1 int,nest2 int))') + map_key = ScalarTypeMetadata(name='_map_key', + parent=map_type, + type_str='string') + inner_struct = StructTypeMetadata(name='_map_value', + parent=map_type, + type_str='row(nest1 int,nest2 int)') + inner_scalar_nest1 = ScalarTypeMetadata(name='nest1', + parent=inner_struct, + type_str='int') + inner_scalar_nest2 = ScalarTypeMetadata(name='nest2', + parent=inner_struct, + type_str='int') + + map_type.map_key_type = map_key + map_type.map_value_type = inner_struct + inner_struct.struct_items = {'nest1': inner_scalar_nest1, 'nest2': inner_scalar_nest2} + inner_scalar_nest1.sort_order = 0 + inner_scalar_nest2.sort_order = 1 + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, map_type) + + def test_transform_struct_type(self) -> None: + column = ColumnMetadata('col1', None, 'row(nest1 int,nest2 int)', 0) + column.set_column_key(self.column_key) + + struct_type = StructTypeMetadata(name='col1', + parent=column, + type_str='row(nest1 int,nest2 int)') + inner_scalar_nest1 = ScalarTypeMetadata(name='nest1', + parent=struct_type, + type_str='int') + inner_scalar_nest2 = ScalarTypeMetadata(name='nest2', + parent=struct_type, + type_str='int') + + struct_type.struct_items = {'nest1': inner_scalar_nest1, 'nest2': inner_scalar_nest2} + inner_scalar_nest1.sort_order = 0 + inner_scalar_nest2.sort_order = 1 + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, struct_type) + + def test_transform_struct_map_array_nested_type(self) -> None: + column = ColumnMetadata('col1', None, 'row(nest1 map(string,array(int)),nest2 array(string))', 0) + column.set_column_key(self.column_key) + + struct_type = StructTypeMetadata(name='col1', + parent=column, + type_str='row(nest1 map(string,array(int)),nest2 array(string))') + inner_map = MapTypeMetadata(name='nest1', + parent=struct_type, + type_str='map(string,array(int))') + inner_map_key = ScalarTypeMetadata(name='_map_key', + parent=inner_map, + type_str='string') + inner_map_array = ArrayTypeMetadata(name='_map_value', + parent=inner_map, + type_str='array(int)') + inner_struct_array = ArrayTypeMetadata(name='nest2', + parent=struct_type, + type_str='array(string)') + + struct_type.struct_items = {'nest1': inner_map, 'nest2': inner_struct_array} + inner_map.map_key_type = inner_map_key + inner_map.map_value_type = inner_map_array + inner_map.sort_order = 0 + inner_struct_array.sort_order = 1 + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, struct_type) + + def test_transform_struct_nested_type_with_quoted_names(self) -> None: + column = ColumnMetadata('col1', None, 'row("nest1" varchar,"nest2" row("nest3" varchar,' + '"nest4" timestamp(3),"nest5" timestamp(3)))', 0) + column.set_column_key(self.column_key) + + struct_type = StructTypeMetadata(name='col1', + parent=column, 
+ type_str='row(nest1 varchar,nest2 row(nest3 varchar,' + 'nest4 timestamp(3),nest5 timestamp(3)))') + inner_scalar = ScalarTypeMetadata(name='nest1', + parent=struct_type, + type_str='varchar') + inner_struct = StructTypeMetadata(name='nest2', + parent=struct_type, + type_str='row(nest3 varchar,nest4 timestamp(3),nest5 timestamp(3))') + inner_scalar_1 = ScalarTypeMetadata(name='nest3', + parent=inner_struct, + type_str='varchar') + inner_scalar_2 = ScalarTypeMetadata(name='nest4', + parent=inner_struct, + type_str='timestamp(3)') + inner_scalar_3 = ScalarTypeMetadata(name='nest5', + parent=inner_struct, + type_str='timestamp(3)') + + struct_type.struct_items = {'nest1': inner_scalar, 'nest2': inner_struct} + inner_struct.struct_items = {'nest3': inner_scalar_1, 'nest4': inner_scalar_2, 'nest5': inner_scalar_3} + inner_scalar.sort_order = 0 + inner_struct.sort_order = 1 + inner_scalar_1.sort_order = 0 + inner_scalar_2.sort_order = 1 + inner_scalar_3.sort_order = 2 + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, struct_type) + + def test_transform_non_alpha_only_types(self) -> None: + column = ColumnMetadata('col1', None, 'row(nest1 decimal(10,2),nest2 double precision,' + 'nest3 varchar(32),nest4 map(varchar(32),decimal(10,2)),' + 'nest5 interval_day_time)', 0) + column.set_column_key(self.column_key) + + struct_type = StructTypeMetadata(name='col1', + parent=column, + type_str='row(nest1 decimal(10,2),nest2 double precision,' + 'nest3 varchar(32),nest4 map(varchar(32),decimal(10,2)),' + 'nest5 interval_day_time)') + inner_scalar_nest1 = ScalarTypeMetadata(name='nest1', + parent=struct_type, + type_str='decimal(10,2)') + inner_scalar_nest2 = ScalarTypeMetadata(name='nest2', + parent=struct_type, + type_str='double precision') + inner_scalar_nest3 = ScalarTypeMetadata(name='nest3', + parent=struct_type, + type_str='varchar(32)') + inner_map_nest4 = MapTypeMetadata(name='nest4', + parent=struct_type, + type_str='map(varchar(32),decimal(10,2))') + inner_map_nest4_key = ScalarTypeMetadata(name='_map_key', + parent=inner_map_nest4, + type_str='varchar(32)') + inner_map_nest4_value = ScalarTypeMetadata(name='_map_value', + parent=inner_map_nest4, + type_str='decimal(10,2)') + inner_scalar_nest5 = ScalarTypeMetadata(name='nest5', + parent=struct_type, + type_str='interval_day_time') + + struct_type.struct_items = {'nest1': inner_scalar_nest1, 'nest2': inner_scalar_nest2, + 'nest3': inner_scalar_nest3, 'nest4': inner_map_nest4, + 'nest5': inner_scalar_nest5} + inner_map_nest4.map_key_type = inner_map_nest4_key + inner_map_nest4.map_value_type = inner_map_nest4_value + inner_scalar_nest1.sort_order = 0 + inner_scalar_nest2.sort_order = 1 + inner_scalar_nest3.sort_order = 2 + inner_map_nest4.sort_order = 3 + inner_scalar_nest5.sort_order = 4 + + actual = parse_trino_type(column.type, column.name, column) + self.assertEqual(actual, struct_type) + + def test_transform_invalid_array_inner_type(self) -> None: + column = ColumnMetadata('col1', None, 'array(array(int*))', 0) + column.set_column_key(self.column_key) + + with self.assertRaises(ParseException): + parse_trino_type(column.type, column.name, column) + + def test_transform_invalid_struct_inner_type(self) -> None: + column = ColumnMetadata('col1', None, 'row(nest1 varchar(256)å,' + 'nest2 (derived from deserializer))', 0) + column.set_column_key(self.column_key) + + with self.assertRaises(ParseException): + parse_trino_type(column.type, column.name, column) + + +if __name__ == '__main__': + 
unittest.main()
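
---

For reviewers who want to exercise the change locally, below is a minimal sketch of how the new Trino parser and the badge/description hooks added in this patch can be used, mirroring the unit tests above. The `trino://...` column key, the field names, the `pii` badge, and the `pyhocon`/`ComplexTypeTransformer` import paths are illustrative assumptions based on the existing databuilder layout, not part of this patch:

```python
from pyhocon import ConfigFactory  # assumed: databuilder configs use pyhocon

from databuilder.models.table_metadata import ColumnMetadata
from databuilder.transformer.complex_type_transformer import (  # assumed import path
    PARSING_FUNCTION, ComplexTypeTransformer,
)
from databuilder.utils.trino_complex_type_parser import parse_trino_type

# Parse a Trino type string directly into a TypeMetadata tree (as in the parser tests).
column = ColumnMetadata('col1', None, 'map(string,row(nest1 int,nest2 int))', 0)
column.set_column_key('trino://gold.test_schema/test_table/col1')  # illustrative key
type_metadata = parse_trino_type(column.type, column.name, column)

# Descriptions and badges can be attached to named nested levels (e.g. struct fields);
# setting them on nodes whose parent is a ColumnMetadata or ArrayTypeMetadata only logs
# a warning, per the new set_description/set_badges behavior.
nest1 = type_metadata.map_value_type.struct_items['nest1']
nest1.set_description('first nested field')  # illustrative text
nest1.set_badges(['pii'])                    # illustrative badge name

# Alternatively, plug the parser into the ComplexTypeTransformer for an ingestion job,
# using the same config key the new transformer test uses.
transformer = ComplexTypeTransformer()
transformer.init(conf=ConfigFactory.from_dict({
    PARSING_FUNCTION: 'databuilder.utils.trino_complex_type_parser.parse_trino_type',
}))
```

The transformer path is the same pattern already used for the Hive parser; only the `PARSING_FUNCTION` value changes to point at `trino_complex_type_parser.parse_trino_type`.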