
Commit 8973458

hsheth2 and treff7es authored
feat(ingest): add urn modification helper (#7440)
Co-authored-by: Tamas Nemeth <[email protected]>
1 parent 145cbd4 commit 8973458

11 files changed: +442 -88 lines changed


metadata-ingestion/scripts/avro_codegen.py

+120 -40
@@ -1,8 +1,7 @@
 import json
-import types
-import unittest.mock
+import re
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Union

 import avro.schema
 import click
@@ -66,29 +65,89 @@ def load_schemas(schemas_path: str) -> Dict[str, dict]:
     return schemas


-def merge_schemas(schemas_obj: List[Any]) -> str:
-    # Combine schemas.
-    merged = ["null"] + schemas_obj
+def patch_schemas(schemas: Dict[str, dict], pdl_path: Path) -> Dict[str, dict]:
+    # We can easily find normal urn types using the generated avro schema,
+    # but for arrays of urns there's nothing in the avro schema and hence
+    # we have to look in the PDL files instead.
+    urn_arrays: Dict[
+        str, List[Tuple[str, str]]
+    ] = {}  # schema name -> list of (field name, type)

-    # Patch add_name method to NOT complain about duplicate names
-    def add_name(self, name_attr, space_attr, new_schema):
-        to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace)
+    # First, we need to load the PDL files and find all urn arrays.
+    for pdl_file in Path(pdl_path).glob("**/*.pdl"):
+        pdl_text = pdl_file.read_text()

-        self.names[to_add.fullname] = new_schema
-        return to_add
+        # TRICKY: We assume that all urn types end with "Urn".
+        arrays = re.findall(
+            r"^\s*(\w+)\s*:\s*(?:optional\s+)?array\[(\w*Urn)\]",
+            pdl_text,
+            re.MULTILINE,
+        )
+        if arrays:
+            schema_name = pdl_file.stem
+            urn_arrays[schema_name] = [(item[0], item[1]) for item in arrays]

-    with unittest.mock.patch("avro.schema.Names.add_name", add_name):
-        cleaned_schema = avro.schema.make_avsc_object(merged)
+    # Then, we can patch each schema.
+    patched_schemas = {}
+    for name, schema in schemas.items():
+        patched_schemas[name] = patch_schema(schema, urn_arrays)

-    # Convert back to an Avro schema JSON representation.
-    class MappingProxyEncoder(json.JSONEncoder):
-        def default(self, obj):
-            if isinstance(obj, types.MappingProxyType):
-                return dict(obj)
-            return json.JSONEncoder.default(self, obj)
+    return patched_schemas
+
+
+def patch_schema(schema: dict, urn_arrays: Dict[str, List[Tuple[str, str]]]) -> dict:
+    """
+    This method patches the schema to add an "Urn" property to all urn fields.
+    Because the inner type in an array is not a named Avro schema, for urn arrays
+    we annotate the array field and add an "urn_is_array" property.
+    """
+
+    # We're using Names() to generate a full list of embedded schemas.
+    all_schemas = avro.schema.Names()
+    patched = avro.schema.make_avsc_object(schema, names=all_schemas)
+
+    for nested in all_schemas.names.values():
+        if isinstance(nested, (avro.schema.EnumSchema, avro.schema.FixedSchema)):
+            continue
+        assert isinstance(nested, avro.schema.RecordSchema)
+
+        # Patch normal urn types.
+        field: avro.schema.Field
+        for field in nested.fields:
+            java_class: Optional[str] = field.props.get("java", {}).get("class")
+            if java_class and java_class.startswith(
+                "com.linkedin.pegasus2avro.common.urn."
+            ):
+                field.set_prop("Urn", java_class.split(".")[-1])
+
+        # Patch array urn types.
+        if nested.name in urn_arrays:
+            mapping = urn_arrays[nested.name]
+
+            for field_name, type in mapping:
+                field = nested.fields_dict[field_name]
+                field.set_prop("Urn", type)
+                field.set_prop("urn_is_array", True)
+
+    return patched.to_json()
+
+
+def merge_schemas(schemas_obj: List[dict]) -> str:
+    # Combine schemas as a "union" of all of the types.
+    merged = ["null"] + schemas_obj
+
+    # Patch add_name method to NOT complain about duplicate names.
+    class NamesWithDups(avro.schema.Names):
+        def add_name(self, name_attr, space_attr, new_schema):
+            to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace)
+            self.names[to_add.fullname] = new_schema
+            return to_add
+
+    cleaned_schema = avro.schema.make_avsc_object(merged, names=NamesWithDups())

+    # Convert back to an Avro schema JSON representation.
     out_schema = cleaned_schema.to_json()
-    encoded = json.dumps(out_schema, cls=MappingProxyEncoder, indent=2)
+    encoded = json.dumps(out_schema, indent=2)
     return encoded

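As context for the regex above, a minimal standalone sketch run against a hypothetical PDL fragment (the record and field names are invented for illustration; only the field syntax mirrors the real models):

    import re

    PDL_SNIPPET = """
    record FakeOwnership {
      owners: array[CorpuserUrn]
      groups: optional array[CorpGroupUrn]
      note: string
    }
    """

    matches = re.findall(
        r"^\s*(\w+)\s*:\s*(?:optional\s+)?array\[(\w*Urn)\]",
        PDL_SNIPPET,
        re.MULTILINE,
    )
    print(matches)  # [('owners', 'CorpuserUrn'), ('groups', 'CorpGroupUrn')]

Only fields whose element type name ends in "Urn" are picked up, which is exactly the "TRICKY" assumption the comment calls out; plain fields like note: string are ignored.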

@@ -149,11 +208,11 @@ def add_avro_python3_warning(filepath: Path) -> None:
 import functools
 import pathlib

+@functools.lru_cache(maxsize=None)
 def _load_schema(schema_name: str) -> str:
     return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()
 """
 individual_schema_method = """
-@functools.lru_cache(maxsize=None)
 def get{schema_name}Schema() -> str:
     return _load_schema("{schema_name}")
 """
@@ -165,6 +224,17 @@ def make_load_schema_methods(schemas: Iterable[str]) -> str:
     )


+def save_raw_schemas(schema_save_dir: Path, schemas: Dict[str, dict]) -> None:
+    # Save raw avsc files.
+    schema_save_dir.mkdir()
+    for name, schema in schemas.items():
+        (schema_save_dir / f"{name}.avsc").write_text(json.dumps(schema, indent=2))
+
+    # Add getXSchema methods.
+    with open(schema_save_dir / "__init__.py", "w") as schema_dir_init:
+        schema_dir_init.write(make_load_schema_methods(schemas.keys()))
+
+
 def annotate_aspects(aspects: List[dict], schema_class_file: Path) -> None:
     schema_classes_lines = schema_class_file.read_text().splitlines()
     line_lookup_table = {line: i for i, line in enumerate(schema_classes_lines)}
@@ -177,9 +247,9 @@ def annotate_aspects(aspects: List[dict], schema_class_file: Path) -> None:
     ] += """

 class _Aspect(DictWrapper):
-    ASPECT_NAME: str = None  # type: ignore
-    ASPECT_TYPE: str = "default"
-    ASPECT_INFO: dict = None  # type: ignore
+    ASPECT_NAME: ClassVar[str] = None  # type: ignore
+    ASPECT_TYPE: ClassVar[str] = "default"
+    ASPECT_INFO: ClassVar[dict] = None  # type: ignore

     def __init__(self):
         if type(self) is _Aspect:
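
The switch to ClassVar is a typing refinement rather than a behavior change: it tells mypy that ASPECT_NAME, ASPECT_TYPE, and ASPECT_INFO are class-level attributes, which generated subclasses may override at class scope but which should not be assigned through an instance. A minimal illustration with invented names:

    from typing import ClassVar

    class Base:
        NAME: ClassVar[str] = "base"

    class Child(Base):
        NAME = "child"  # fine: overrides the class-level attribute

    c = Child()
    c.NAME = "oops"  # runs, but mypy rejects assigning to a ClassVar via an instance
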
@@ -225,16 +295,18 @@ def get_aspect_info(cls) -> dict:
         schema_classes_lines[
             empty_line
         ] += f"\n    ASPECT_TYPE = '{aspect['Aspect']['type']}'"
-        schema_classes_lines[empty_line] += f"\n    ASPECT_INFO = {aspect['Aspect']}"
+
+        aspect_info = {
+            k: v for k, v in aspect["Aspect"].items() if k not in {"name", "type"}
+        }
+        schema_classes_lines[empty_line] += f"\n    ASPECT_INFO = {aspect_info}"

         schema_classes_lines[empty_line + 1] += "\n"

     # Finally, generate a big list of all available aspects.
     newline = "\n"
     schema_classes_lines.append(
         f"""
-from typing import Type
-
 ASPECT_CLASSES: List[Type[_Aspect]] = [
     {f',{newline}    '.join(f"{aspect['name']}Class" for aspect in aspects)}
 ]
@@ -252,14 +324,22 @@ def get_aspect_info(cls) -> dict:
 @click.argument(
     "entity_registry", type=click.Path(exists=True, dir_okay=False), required=True
 )
+@click.argument(
+    "pdl_path", type=click.Path(exists=True, file_okay=False), required=True
+)
 @click.argument(
     "schemas_path", type=click.Path(exists=True, file_okay=False), required=True
 )
 @click.argument("outdir", type=click.Path(), required=True)
-def generate(entity_registry: str, schemas_path: str, outdir: str) -> None:
+def generate(
+    entity_registry: str, pdl_path: str, schemas_path: str, outdir: str
+) -> None:
     entities = load_entity_registry(Path(entity_registry))
     schemas = load_schemas(schemas_path)

+    # Patch the avsc files.
+    schemas = patch_schemas(schemas, Path(pdl_path))
+
     # Special handling for aspects.
     aspects = {
         schema["Aspect"]["name"]: schema
@@ -288,8 +368,8 @@ def generate(entity_registry: str, schemas_path: str, outdir: str) -> None:

     # Check for unused aspects. We currently have quite a few.
     # unused_aspects = set(aspects.keys()) - set().union(
-    #    {entity.keyAspect for entity in entities},
-    #    *(set(entity.aspects) for entity in entities),
+    #     {entity.keyAspect for entity in entities},
+    #     *(set(entity.aspects) for entity in entities),
     # )

     merged_schema = merge_schemas(list(schemas.values()))
@@ -303,17 +383,17 @@ def generate(entity_registry: str, schemas_path: str, outdir: str) -> None:
         Path(outdir) / "schema_classes.py",
     )

-    # Save raw schema files in codegen as well.
+    # Keep a copy of a few raw avsc files.
+    required_avsc_schemas = {"MetadataChangeEvent", "MetadataChangeProposal"}
     schema_save_dir = Path(outdir) / "schemas"
-    schema_save_dir.mkdir()
-    for schema_out_file, schema in schemas.items():
-        (schema_save_dir / f"{schema_out_file}.avsc").write_text(
-            json.dumps(schema, indent=2)
-        )
-
-    # Add load_schema method.
-    with open(schema_save_dir / "__init__.py", "a") as schema_dir_init:
-        schema_dir_init.write(make_load_schema_methods(schemas.keys()))
+    save_raw_schemas(
+        schema_save_dir,
+        {
+            name: schema
+            for name, schema in schemas.items()
+            if name in required_avsc_schemas
+        },
+    )

     # Add headers for all generated files
     generated_files = Path(outdir).glob("**/*.py")
metadata-ingestion/scripts/codegen.sh

+4 -2
@@ -5,8 +5,10 @@ OUTDIR=./src/datahub/metadata

 # Note: this assumes that datahub has already been built with `./gradlew build`.
 DATAHUB_ROOT=..
-SCHEMAS_ROOT="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/renamed/avro/com/linkedin"
+
+SCHEMAS_PDL="$DATAHUB_ROOT/metadata-models/src/main/pegasus/com/linkedin"
+SCHEMAS_AVSC="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/renamed/avro/com/linkedin"
 ENTITY_REGISTRY="$DATAHUB_ROOT/metadata-models/src/main/resources/entity-registry.yml"

 rm -r $OUTDIR 2>/dev/null || true
-python scripts/avro_codegen.py $ENTITY_REGISTRY $SCHEMAS_ROOT $OUTDIR
+python scripts/avro_codegen.py $ENTITY_REGISTRY $SCHEMAS_PDL $SCHEMAS_AVSC $OUTDIR

metadata-ingestion/setup.py

+1 -1
@@ -37,7 +37,7 @@ def get_long_description():
         "entrypoints",
         "docker",
         "expandvars>=0.6.5",
-        "avro-gen3==0.7.8",
+        "avro-gen3==0.7.10",
         # "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3",
         "avro>=1.10.2,<1.11",
         "python-dateutil>=2.8.0",

metadata-ingestion/src/datahub/ingestion/graph/client.py

+4 -12
@@ -144,9 +144,7 @@ def get_aspect(
         response_json = response.json()

         # Figure out what field to look in.
-        record_schema: RecordSchema = aspect_type.__getattribute__(
-            aspect_type, "RECORD_SCHEMA"
-        )
+        record_schema: RecordSchema = aspect_type.RECORD_SCHEMA
         aspect_type_name = record_schema.fullname.replace(".pegasus2avro", "")

         # Deserialize the aspect json into the aspect type.
@@ -335,15 +333,9 @@ def get_aspects_for_entity(

         result: Dict[str, Optional[Aspect]] = {}
         for aspect_type in aspect_types:
-            record_schema: RecordSchema = aspect_type.__getattribute__(
-                aspect_type, "RECORD_SCHEMA"
-            )
-            if not record_schema:
-                logger.warning(
-                    f"Failed to infer type name of the aspect from the aspect type class {aspect_type}. Continuing, but this will fail."
-                )
-            else:
-                aspect_type_name = record_schema.props["Aspect"]["name"]
+            record_schema = aspect_type.RECORD_SCHEMA
+            aspect_type_name = record_schema.props["Aspect"]["name"]
+
             aspect_json = response_json.get("aspects", {}).get(aspect_type_name)
             if aspect_json:
                 # need to apply a transform to the response to match rest.li and avro serialization
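
Both call sites simplify for the same reason: avrogen emits RECORD_SCHEMA as an ordinary class attribute on every generated class, so the aspect_type.__getattribute__(aspect_type, "RECORD_SCHEMA") incantation was unnecessary, and the not record_schema fallback was effectively dead code. A sketch of the simplified lookup, assuming the generated StatusClass aspect from datahub.metadata.schema_classes:

    from datahub.metadata.schema_classes import StatusClass

    record_schema = StatusClass.RECORD_SCHEMA  # plain class-attribute access
    print(record_schema.fullname)                 # e.g. com.linkedin.pegasus2avro.common.Status
    print(record_schema.props["Aspect"]["name"])  # e.g. "status"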

metadata-ingestion/src/datahub/ingestion/sink/sink_registry.py

-5
@@ -13,8 +13,3 @@ def _check_sink_classes(cls: Type[Sink]) -> None:

 sink_registry = PluginRegistry[Sink](extra_cls_check=_check_sink_classes)
 sink_registry.register_from_entrypoint("datahub.ingestion.sink.plugins")
-
-# These sinks are always enabled
-assert sink_registry.get("console")
-assert sink_registry.get("file")
-assert sink_registry.get("blackhole")
