datahub-project · hsheth2 · Feb 7, 2025 · Feb 7, 2025
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
@@ -27,7 +27,7 @@ concurrency:
 jobs:
   metadata-ingestion:
     runs-on: ubuntu-latest
-    timeout-minutes: 40
+    timeout-minutes: 60
     env:
       DATAHUB_TELEMETRY_ENABLED: false
       # TODO: Enable this once the test is fixed.

diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
@@ -1,5 +1,4 @@
 import logging
-import os
 import pathlib
 from typing import Any, Dict, Optional
 
@@ -8,11 +7,10 @@
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage
+from datahub.testing.pytest_hooks import get_golden_settings
 
 logger = logging.getLogger(__name__)
 
-UPDATE_FILES = os.environ.get("UPDATE_SQLPARSER_FILES", "false").lower() == "true"
-
 
 def assert_sql_result_with_resolver(
     sql: str,
@@ -22,6 +20,8 @@ def assert_sql_result_with_resolver(
     allow_table_error: bool = False,
     **kwargs: Any,
 ) -> None:
+    settings = get_golden_settings()
+
     # HACK: Our BigQuery source overwrites this value and doesn't undo it.
     # As such, we need to handle that here.
     BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "_yyyymmdd"
@@ -47,15 +47,14 @@ def assert_sql_result_with_resolver(
         )
 
     txt = res.json(indent=4)
-    if UPDATE_FILES:
+    if settings.update_golden:
         expected_file.write_text(txt)
         return
 
     if not expected_file.exists():
         expected_file.write_text(txt)
         raise AssertionError(
-            f"Expected file {expected_file} does not exist. "
-            "Created it with the expected output. Please verify it."
+            f"Missing expected golden file; run with --update-golden-files to create it: {expected_file}"
         )
 
     expected = SqlParsingResult.parse_raw(expected_file.read_text())

diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py
@@ -16,6 +16,7 @@
 from datahub.ingestion.sink.file import write_metadata_file
 from datahub.ingestion.source.file import read_metadata_file
 from datahub.testing.mcp_diff import CannotCompareMCPs, MCPDiff, get_aspects_by_urn
+from datahub.testing.pytest_hooks import get_golden_settings
 
 logger = logging.getLogger(__name__)
 
@@ -40,26 +41,26 @@ def load_json_file(filename: Union[str, os.PathLike]) -> MetadataJson:
 def assert_metadata_files_equal(
     output_path: Union[str, os.PathLike],
     golden_path: Union[str, os.PathLike],
-    update_golden: bool,
-    copy_output: bool,
     ignore_paths: Sequence[str] = (),
     ignore_paths_v2: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> None:
+    settings = get_golden_settings()
+
     golden_exists = os.path.isfile(golden_path)
 
-    if copy_output:
+    if settings.copy_output:
         shutil.copyfile(str(output_path), str(golden_path) + ".output")
         logger.info(f"Copied output file to {golden_path}.output")
 
-    if not update_golden and not golden_exists:
+    if not settings.update_golden and not golden_exists:
         raise FileNotFoundError(
             "Golden file does not exist. Please run with the --update-golden-files option to create."
         )
 
     output = load_json_file(output_path)
 
-    if update_golden and not golden_exists:
+    if settings.update_golden and not golden_exists:
         shutil.copyfile(str(output_path), str(golden_path))
         return
     else:
@@ -87,7 +88,7 @@ def assert_metadata_files_equal(
     ignore_paths = (*ignore_paths, *default_exclude_paths)
 
     diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order)
-    if diff and update_golden:
+    if diff and settings.update_golden:
         if isinstance(diff, MCPDiff) and diff.is_delta_valid:
             logger.info(f"Applying delta to golden file {golden_path}")
             diff.apply_delta(golden)

diff --git a/metadata-ingestion/src/datahub/testing/pytest_hooks.py b/metadata-ingestion/src/datahub/testing/pytest_hooks.py
@@ -0,0 +1,80 @@
+"""Pytest plumbing for the golden-file command line flags.
+
+A conftest calls register_golden_flags from its pytest_addoption hook and imports
+the autouse load_golden_flags fixture; test helpers can then read the flag values
+through get_golden_settings without being handed a pytestconfig object.
+"""
+
+import dataclasses
+from typing import Optional
+
+import pytest
+
+
+@dataclasses.dataclass
+class GoldenFileSettings:
+    """Values of the golden-file command line flags for the current pytest session."""
+
+    # Value of --update-golden-files.
+    update_golden: bool
+    # Value of --copy-output-files.
+    copy_output: bool
+
+
+# Set once register_golden_flags has run, i.e. the flags exist on the parser.
+_registered: bool = False
+# Populated by the load_golden_flags fixture when the test session starts.
+_settings: Optional[GoldenFileSettings] = None
+
+
+def register_golden_flags(parser: pytest.Parser) -> None:
+    """Add the golden-file flags to the pytest parser and record that this was done.
+
+    Intended to be called from a conftest's pytest_addoption hook.
+    """
+    parser.addoption(
+        "--update-golden-files",
+        action="store_true",
+        default=False,
+        help="Create or update golden files from the test output.",
+    )
+
+    # TODO: Deprecate and remove this flag.
+    parser.addoption(
+        "--copy-output-files",
+        action="store_true",
+        default=False,
+        help="Copy the test output file to <golden path>.output.",
+    )
+
+    global _registered
+    _registered = True
+
+
+@pytest.fixture(scope="session", autouse=True)
+def load_golden_flags(pytestconfig: pytest.Config) -> None:
+    """Snapshot the golden-file flags from the pytest config into module state."""
+    global _settings
+    _settings = GoldenFileSettings(
+        update_golden=pytestconfig.getoption("--update-golden-files"),
+        copy_output=pytestconfig.getoption("--copy-output-files"),
+    )
+
+
+def get_golden_settings() -> GoldenFileSettings:
+    """Return the golden-file settings for the running pytest session.
+
+    Raises:
+        ValueError: If register_golden_flags was never called, or if the
+            load_golden_flags fixture has not populated the settings yet.
+    """
+    if not _registered:
+        raise ValueError(
+            "Golden files aren't set up properly. Call register_golden_flags from a conftest pytest_addoption method."
+        )
+    # Compare against None explicitly: the check is for "not loaded yet", not truthiness.
+    if _settings is None:
+        raise ValueError(
+            "Golden files aren't set up properly. Ensure load_golden_flags is imported in your conftest."
+        )
+    return _settings
diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py
@@ -22,6 +22,10 @@
 
 # We need our imports to go below the os.environ updates, since mere act
 # of importing some datahub modules will load env variables.
+from datahub.testing.pytest_hooks import (  # noqa: E402
+    load_golden_flags,
+    register_golden_flags,
+)
 from tests.test_helpers.docker_helpers import (  # noqa: F401,E402
     docker_compose_command,
     docker_compose_runner,
@@ -55,12 +59,11 @@ def fake_time():
 
 
 def pytest_addoption(parser):
-    parser.addoption(
-        "--update-golden-files",
-        action="store_true",
-        default=False,
-    )
-    parser.addoption("--copy-output-files", action="store_true", default=False)
+    register_golden_flags(parser)
+
+
+# It's an autouse fixture, so importing it is sufficient.
+assert load_golden_flags is not None
 
 
 def pytest_collection_modifyitems(

diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
@@ -73,9 +73,7 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi
 
 @pytest.mark.parametrize("procedure_sql_file", procedure_sqls)
 @pytest.mark.integration
-def test_stored_procedure_lineage(
-    pytestconfig: pytest.Config, procedure_sql_file: str
-) -> None:
+def test_stored_procedure_lineage(procedure_sql_file: str) -> None:
     sql_file_path = PROCEDURE_SQLS_DIR / procedure_sql_file
     procedure_code = sql_file_path.read_text()
 
@@ -105,7 +103,6 @@ def test_stored_procedure_lineage(
         )
     )
     mce_helpers.check_goldens_stream(
-        pytestconfig,
         outputs=mcps,
         golden_path=(
             PROCEDURES_GOLDEN_DIR / Path(procedure_sql_file).with_suffix(".json")

diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py
@@ -85,21 +85,21 @@ def check_golden_file(
     ignore_paths_v2: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> None:
-    update_golden = pytestconfig.getoption("--update-golden-files")
-    copy_output = pytestconfig.getoption("--copy-output-files")
+    # TODO: Remove the pytestconfig parameter since it's redundant.
+    # Or more straightforward - we can remove the `check_golden_file` method
+    # and use assert_metadata_files_equal directly. Maybe call it "check_golden_metadata"?
+    # In a lot of cases, the output_path is also just annoying - our pytest setup
+    # should be responsible for figuring out where to put the temp file.
     assert_metadata_files_equal(
         output_path=output_path,
         golden_path=golden_path,
-        update_golden=update_golden,
-        copy_output=copy_output,
         ignore_paths=ignore_paths,
         ignore_paths_v2=ignore_paths_v2,
         ignore_order=ignore_order,
     )
 
 
 def check_goldens_stream(
-    pytestconfig: pytest.Config,
     outputs: List,
     golden_path: Union[str, os.PathLike],
     ignore_paths: Sequence[str] = (),
@@ -108,8 +108,7 @@ def check_goldens_stream(
     with tempfile.NamedTemporaryFile() as f:
         write_metadata_file(pathlib.Path(f.name), outputs)
 
-        check_golden_file(
-            pytestconfig=pytestconfig,
+        assert_metadata_files_equal(
             output_path=f.name,
             golden_path=golden_path,
             ignore_paths=ignore_paths,

diff --git a/metadata-ingestion/tests/test_helpers/sdk_v2_helpers.py b/metadata-ingestion/tests/test_helpers/sdk_v2_helpers.py
@@ -1,16 +1,11 @@
 import pathlib
 
-import pytest
-
 from datahub.sdk._entity import Entity
 from tests.test_helpers import mce_helpers
 
 
-def assert_entity_golden(
-    pytestconfig: pytest.Config, entity: Entity, golden_path: pathlib.Path
-) -> None:
+def assert_entity_golden(entity: Entity, golden_path: pathlib.Path) -> None:
     mce_helpers.check_goldens_stream(
-        pytestconfig=pytestconfig,
         outputs=entity._as_mcps(),
         golden_path=golden_path,
         ignore_order=False,

diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py
@@ -32,7 +32,6 @@ def test_structuredproperties_load(pytestconfig: pytest.Config) -> None:
         mcps.extend(property.generate_mcps())
 
     check_goldens_stream(
-        pytestconfig,
         mcps,
         golden_path=RESOURCE_DIR / "example_structured_properties_golden.json",
     )
diff --git a/metadata-ingestion/tests/unit/sdk_v2/test_container.py b/metadata-ingestion/tests/unit/sdk_v2/test_container.py
@@ -20,7 +20,7 @@
 _GOLDEN_DIR = pathlib.Path(__file__).parent / "container_golden"
 
 
-def test_container_basic(pytestconfig: pytest.Config) -> None:
+def test_container_basic() -> None:
     db_key = DatabaseKey(
         platform="bigquery",
         database="my_bq_project",
@@ -60,12 +60,10 @@ def test_container_basic(pytestconfig: pytest.Config) -> None:
         # This should fail. Eventually we should make it suggest calling set_owners instead.
         c.owners = []  # type: ignore
 
-    assert_entity_golden(
-        pytestconfig, c, _GOLDEN_DIR / "test_container_basic_golden.json"
-    )
+    assert_entity_golden(c, _GOLDEN_DIR / "test_container_basic_golden.json")
 
 
-def test_container_complex(pytestconfig: pytest.Config) -> None:
+def test_container_complex() -> None:
     schema_key = SchemaKey(
         platform="snowflake",
         instance="my_instance",
@@ -75,7 +73,7 @@ def test_container_complex(pytestconfig: pytest.Config) -> None:
     created = datetime(2025, 1, 2, 3, 4, 5, tzinfo=timezone.utc)
     updated = datetime(2025, 1, 9, 3, 4, 6, tzinfo=timezone.utc)
 
-    d = Container(
+    c = Container(
         schema_key,
         display_name="MY_SCHEMA",
         qualified_name="MY_DB.MY_SCHEMA",
@@ -100,19 +98,19 @@ def test_container_complex(pytestconfig: pytest.Config) -> None:
         ],
         domain=DomainUrn("Marketing"),
     )
-    assert d.platform_instance is not None
+    assert c.platform_instance is not None
     assert (
-        str(d.platform_instance)
+        str(c.platform_instance)
         == "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,my_instance)"
     )
-    assert d.subtype == "Schema"
-    assert d.description == "test"
-    assert d.display_name == "MY_SCHEMA"
-    assert d.qualified_name == "MY_DB.MY_SCHEMA"
-    assert d.external_url == "https://example.com"
-    assert d.created == created
-    assert d.last_modified == updated
-    assert d.custom_properties == {
+    assert c.subtype == "Schema"
+    assert c.description == "test"
+    assert c.display_name == "MY_SCHEMA"
+    assert c.qualified_name == "MY_DB.MY_SCHEMA"
+    assert c.external_url == "https://example.com"
+    assert c.created == created
+    assert c.last_modified == updated
+    assert c.custom_properties == {
         "platform": "snowflake",
         "instance": "my_instance",
         "database": "MY_DB",
@@ -122,14 +120,12 @@ def test_container_complex(pytestconfig: pytest.Config) -> None:
     }
 
     # Check standard aspects.
-    assert d.domain == DomainUrn("Marketing")
-    assert d.tags is not None
-    assert len(d.tags) == 2
-    assert d.terms is not None
-    assert len(d.terms) == 1
-    assert d.owners is not None
-    assert len(d.owners) == 1
+    assert c.domain == DomainUrn("Marketing")
+    assert c.tags is not None
+    assert len(c.tags) == 2
+    assert c.terms is not None
+    assert len(c.terms) == 1
+    assert c.owners is not None
+    assert len(c.owners) == 1
 
-    assert_entity_golden(
-        pytestconfig, d, _GOLDEN_DIR / "test_container_complex_golden.json"
-    )
+    assert_entity_golden(c, _GOLDEN_DIR / "test_container_complex_golden.json")
diff --git a/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py b/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py
@@ -65,9 +65,7 @@ def test_dataset_basic(pytestconfig: pytest.Config) -> None:
         # This should fail. Eventually we should make it suggest calling set_owners instead.
         d.owners = []  # type: ignore
 
-    assert_entity_golden(
-        pytestconfig, d, _GOLDEN_DIR / "test_dataset_basic_golden.json"
-    )
+    assert_entity_golden(d, _GOLDEN_DIR / "test_dataset_basic_golden.json")
 
 
 def _build_complex_dataset() -> Dataset:
@@ -161,17 +159,13 @@ def _build_complex_dataset() -> Dataset:
     return d
 
 
-def test_dataset_complex(pytestconfig: pytest.Config) -> None:
+def test_dataset_complex() -> None:
     d = _build_complex_dataset()
-    assert_entity_golden(
-        pytestconfig, d, _GOLDEN_DIR / "test_dataset_complex_golden.json"
-    )
+    assert_entity_golden(d, _GOLDEN_DIR / "test_dataset_complex_golden.json")
 
 
-def test_dataset_ingestion(pytestconfig: pytest.Config) -> None:
+def test_dataset_ingestion() -> None:
     with change_default_attribution(KnownAttribution.INGESTION):
         d = _build_complex_dataset()
 
-        assert_entity_golden(
-            pytestconfig, d, _GOLDEN_DIR / "test_dataset_ingestion_golden.json"
-        )
+        assert_entity_golden(d, _GOLDEN_DIR / "test_dataset_ingestion_golden.json")
diff --git a/metadata-ingestion/tests/unit/sdk_v2/test_entity_client.py b/metadata-ingestion/tests/unit/sdk_v2/test_entity_client.py
@@ -35,7 +35,6 @@ def assert_client_golden(
 ) -> None:
     mcps = client._graph.emit_mcps.call_args[0][0]  # type: ignore
     mce_helpers.check_goldens_stream(
-        pytestconfig=pytestconfig,
         outputs=mcps,
         golden_path=golden_path,
         ignore_order=False,