From 86ed40c9041133e5da18655d76b8b2127d94bc4e Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Fri, 24 Jan 2025 07:27:32 +0000 Subject: [PATCH 01/18] fix(docs): Add links to new datahub cloud event source (#12450) --- docs/actions/sources/datahub-cloud-event-source.md | 2 +- docs/managed-datahub/datahub-api/entity-events-api.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/actions/sources/datahub-cloud-event-source.md b/docs/actions/sources/datahub-cloud-event-source.md index d1751ae256867..656fe4a3a6329 100644 --- a/docs/actions/sources/datahub-cloud-event-source.md +++ b/docs/actions/sources/datahub-cloud-event-source.md @@ -38,7 +38,7 @@ If you've configured your Action pipeline `failure_mode` to be `THROW`, then eve The DataHub Cloud Event Source produces -- [Entity Change Event V1](../events/entity-change-event.md) +- [Entity Change Event V1](../../managed-datahub/datahub-api/entity-events-api.md) Note that the DataHub Cloud Event Source does _not_ yet support the full [Metadata Change Log V1](../events/metadata-change-log-event.md) event stream. diff --git a/docs/managed-datahub/datahub-api/entity-events-api.md b/docs/managed-datahub/datahub-api/entity-events-api.md index e59f1650c7d76..377f2fd01e813 100644 --- a/docs/managed-datahub/datahub-api/entity-events-api.md +++ b/docs/managed-datahub/datahub-api/entity-events-api.md @@ -15,6 +15,7 @@ The Events API allows you to integrate changes happening on the DataHub Metadata ### Supported Integrations * [AWS EventBridge](docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge.md) +* [DataHub Cloud Event Source](docs/actions/sources/datahub-cloud-event-source.md) ### Use Cases From f80d58d297653bb628f5b6e6cb4969b691af44eb Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 24 Jan 2025 00:42:33 -0800 Subject: [PATCH 02/18] fix(cli): ignore prereleases when suggesting upgrades (#12424) --- metadata-ingestion/src/datahub/upgrade/upgrade.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index fb14514588e5f..7872681797d6f 100644 --- a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -93,11 +93,11 @@ async def get_github_stats(): async with aiohttp.ClientSession( headers={"Accept": "application/vnd.github.v3+json"} ) as session: - gh_url = "https://api.github.com/repos/datahub-project/datahub/releases" + gh_url = "https://api.github.com/repos/datahub-project/datahub/releases/latest" async with session.get(gh_url) as gh_response: gh_response_json = await gh_response.json() - latest_server_version = Version(gh_response_json[0].get("tag_name")) - latest_server_date = gh_response_json[0].get("published_at") + latest_server_version = Version(gh_response_json.get("tag_name")) + latest_server_date = gh_response_json.get("published_at") return (latest_server_version, latest_server_date) From b701e0714ed4b08646d877dce4c0b3e430e0c257 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 24 Jan 2025 00:42:48 -0800 Subject: [PATCH 03/18] fix(ingest/clickhouse): remove unused lineage_properties code path (#12442) --- .../ingestion/source/sql/clickhouse.py | 48 ++----------------- 1 file changed, 5 insertions(+), 43 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index a8208ca807ed0..a2db116cf2091 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -53,7 +53,6 @@ ) from datahub.metadata.schema_classes import ( DatasetLineageTypeClass, - DatasetPropertiesClass, DatasetSnapshotClass, UpstreamClass, ) @@ -418,41 +417,11 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot assert dataset_snapshot - lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp( - wu.metadata.proposedSnapshot.urn - ) + lineage_mcp = self.get_lineage_mcp(wu.metadata.proposedSnapshot.urn) if lineage_mcp is not None: yield lineage_mcp.as_workunit() - if lineage_properties_aspect: - aspects = dataset_snapshot.aspects - if aspects is None: - aspects = [] - - dataset_properties_aspect: Optional[DatasetPropertiesClass] = None - - for aspect in aspects: - if isinstance(aspect, DatasetPropertiesClass): - dataset_properties_aspect = aspect - - if dataset_properties_aspect is None: - dataset_properties_aspect = DatasetPropertiesClass() - aspects.append(dataset_properties_aspect) - - custom_properties = ( - { - **dataset_properties_aspect.customProperties, - **lineage_properties_aspect.customProperties, - } - if dataset_properties_aspect.customProperties - else lineage_properties_aspect.customProperties - ) - dataset_properties_aspect.customProperties = custom_properties - dataset_snapshot.aspects = aspects - - dataset_snapshot.aspects.append(dataset_properties_aspect) - # Emit the work unit from super. yield wu @@ -656,19 +625,16 @@ def _populate_lineage(self) -> None: def get_lineage_mcp( self, dataset_urn: str - ) -> Tuple[ - Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass] - ]: + ) -> Optional[MetadataChangeProposalWrapper]: dataset_key = mce_builder.dataset_urn_to_key(dataset_urn) if dataset_key is None: - return None, None + return None if not self._lineage_map: self._populate_lineage() assert self._lineage_map is not None upstream_lineage: List[UpstreamClass] = [] - custom_properties: Dict[str, str] = {} if dataset_key.name in self._lineage_map: item = self._lineage_map[dataset_key.name] @@ -684,16 +650,12 @@ def get_lineage_mcp( ) upstream_lineage.append(upstream_table) - properties = None - if custom_properties: - properties = DatasetPropertiesClass(customProperties=custom_properties) - if not upstream_lineage: - return None, properties + return None mcp = MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=UpstreamLineage(upstreams=upstream_lineage), ) - return mcp, properties + return mcp From f8149084a1826bfd67c5708372ce5e64eaa0c05d Mon Sep 17 00:00:00 2001 From: Saketh Varma Date: Fri, 24 Jan 2025 14:14:34 +0530 Subject: [PATCH 04/18] fix(ui): fetch Data Products always from the network (#11165) Co-authored-by: Aseem Bansal --- .../src/app/entity/domain/DataProductsTab/DataProductsTab.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductsTab.tsx b/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductsTab.tsx index 15cc99127f350..39f89979dd95a 100644 --- a/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductsTab.tsx +++ b/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductsTab.tsx @@ -68,6 +68,7 @@ export default function DataProductsTab() { searchFlags: { skipCache: true }, }, }, + fetchPolicy: 'no-cache', }); const totalResults = data?.searchAcrossEntities?.total || 0; 
const searchResults = data?.searchAcrossEntities?.searchResults?.map((r) => r.entity) || []; From 0f538d8df2ce26bdc7f46757e95c2c9735d40182 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:51:52 +0530 Subject: [PATCH 05/18] fix(ingest): fix reporting for missing secure view lineage (#12430) Co-authored-by: Harshal Sheth --- .../source/snowflake/snowflake_schema_gen.py | 24 +++++--- .../snowflake/test_snowflake_failures.py | 60 +++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index a2d69d9e55291..04bc51f1ebd3f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -491,15 +491,25 @@ def fetch_secure_view_definition( try: view_definitions = self.data_dictionary.get_secure_view_definitions() return view_definitions[db_name][schema_name][table_name] + except KeyError: + # Received secure view definitions but the view is not present in results + self.structured_reporter.info( + title="Secure view definition not found", + message="Lineage will be missing for the view.", + context=f"{db_name}.{schema_name}.{table_name}", + ) + return None except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = ( - "Failed to get secure views definitions. Please check permissions." - ) - else: - error_msg = "Failed to get secure views definitions" + action_msg = ( + "Please check permissions." + if isinstance(e, SnowflakePermissionError) + else "" + ) + self.structured_reporter.warning( - error_msg, + title="Failed to get secure views definitions", + message=f"Lineage will be missing for the view. 
{action_msg}", + context=f"{db_name}.{schema_name}.{table_name}", exc=e, ) return None diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index de6e996a52642..4cb6cec4906ef 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -260,3 +260,63 @@ def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failu assert "usage-permission-error" in [ failure.message for failure in pipeline.source.get_report().failures ] + + +@freeze_time(FROZEN_TIME) +def test_snowflake_missing_snowflake_secure_view_definitions_raises_pipeline_info( + pytestconfig, + snowflake_pipeline_config, +): + with mock.patch("snowflake.connector.connect") as mock_connect: + sf_connection = mock.MagicMock() + sf_cursor = mock.MagicMock() + mock_connect.return_value = sf_connection + sf_connection.cursor.return_value = sf_cursor + + # Empty secure view definitions + sf_cursor.execute.side_effect = query_permission_response_override( + default_query_results, + [snowflake_query.SnowflakeQuery.get_secure_view_definitions()], + [], + ) + pipeline = Pipeline(snowflake_pipeline_config) + pipeline.run() + + pipeline.raise_from_status(raise_warnings=True) + assert pipeline.source.get_report().infos.as_obj() == [ + { + "title": "Secure view definition not found", + "message": "Lineage will be missing for the view.", + "context": ["TEST_DB.TEST_SCHEMA.VIEW_1"], + } + ] + + +@freeze_time(FROZEN_TIME) +def test_snowflake_failed_secure_view_definitions_query_raises_pipeline_warning( + pytestconfig, + snowflake_pipeline_config, +): + with mock.patch("snowflake.connector.connect") as mock_connect: + sf_connection = mock.MagicMock() + sf_cursor = mock.MagicMock() + mock_connect.return_value = sf_connection + sf_connection.cursor.return_value = sf_cursor + + # Error in getting secure view definitions + sf_cursor.execute.side_effect = query_permission_error_override( + default_query_results, + [snowflake_query.SnowflakeQuery.get_secure_view_definitions()], + "Database 'SNOWFLAKE' does not exist or not authorized.", + ) + pipeline = Pipeline(snowflake_pipeline_config) + pipeline.run() + assert pipeline.source.get_report().warnings.as_obj() == [ + { + "title": "Failed to get secure views definitions", + "message": "Lineage will be missing for the view. Please check permissions.", + "context": [ + "TEST_DB.TEST_SCHEMA.VIEW_1 : Database 'SNOWFLAKE' does not exist or not authorized." 
+ ], + } + ] From a8d6c54965006462cf5f2fc5f9029ff517fb5a40 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 24 Jan 2025 11:35:39 -0800 Subject: [PATCH 06/18] feat(sdk): move version info to dedicated file (#12456) --- .../datahub-ingestion-base/smoke.Dockerfile | 6 +- docker/datahub-ingestion/Dockerfile | 8 +- docker/datahub-ingestion/Dockerfile-slim-only | 4 +- .../airflow-plugin/.gitignore | 1 - .../airflow-plugin/scripts/release.sh | 25 +++--- .../airflow-plugin/setup.py | 2 +- .../src/datahub_airflow_plugin/__init__.py | 19 +---- .../src/datahub_airflow_plugin/_version.py | 3 + .../datahub_airflow_plugin/datahub_plugin.py | 2 +- .../dagster-plugin/.gitignore | 1 - .../dagster-plugin/scripts/release.sh | 25 +++--- .../dagster-plugin/setup.py | 2 +- .../src/datahub_dagster_plugin/__init__.py | 22 +---- .../src/datahub_dagster_plugin/_version.py | 3 + .../gx-plugin/.gitignore | 1 - .../gx-plugin/scripts/release.sh | 25 +++--- metadata-ingestion-modules/gx-plugin/setup.py | 2 +- .../src/datahub_gx_plugin/__init__.py | 22 +---- .../src/datahub_gx_plugin/_version.py | 3 + .../prefect-plugin/.gitignore | 1 - .../prefect-plugin/scripts/release.sh | 25 +++--- .../prefect-plugin/setup.py | 6 +- .../src/prefect_datahub/__init__.py | 22 +---- .../src/prefect_datahub/_version.py | 3 + metadata-ingestion/.gitignore | 1 - metadata-ingestion/scripts/release.sh | 25 +++--- metadata-ingestion/setup.py | 17 ++-- metadata-ingestion/src/datahub/__init__.py | 26 +----- metadata-ingestion/src/datahub/_version.py | 13 +++ .../src/datahub/cli/check_cli.py | 2 +- .../src/datahub/cli/cli_utils.py | 6 +- .../src/datahub/cli/ingest_cli.py | 4 +- .../src/datahub/emitter/rest_emitter.py | 2 +- metadata-ingestion/src/datahub/entrypoints.py | 10 +-- .../src/datahub/ingestion/api/registry.py | 2 +- .../datahub_ingestion_run_summary_provider.py | 2 +- .../src/datahub/ingestion/run/connection.py | 2 +- .../src/datahub/ingestion/run/pipeline.py | 6 +- .../datahub/ingestion/source/unity/proxy.py | 4 +- .../src/datahub/telemetry/telemetry.py | 8 +- .../src/datahub/testing/check_imports.py | 28 ++++++ .../src/datahub/upgrade/upgrade.py | 2 +- .../tests/unit/test_packages.py | 6 +- .../tests/unit/test_packaging.py | 4 +- python-build/generate_release_scripts.py | 85 +++++++++++++++++++ 45 files changed, 277 insertions(+), 211 deletions(-) create mode 100644 metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_version.py create mode 100644 metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/_version.py create mode 100644 metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/_version.py create mode 100644 metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/_version.py create mode 100644 metadata-ingestion/src/datahub/_version.py create mode 100644 python-build/generate_release_scripts.py diff --git a/docker/datahub-ingestion-base/smoke.Dockerfile b/docker/datahub-ingestion-base/smoke.Dockerfile index 34654faaad729..81a6bd0e20cac 100644 --- a/docker/datahub-ingestion-base/smoke.Dockerfile +++ b/docker/datahub-ingestion-base/smoke.Dockerfile @@ -20,9 +20,9 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ COPY . 
/datahub-src ARG RELEASE_VERSION RUN cd /datahub-src && \ - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" metadata-ingestion/src/datahub/__init__.py && \ - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py && \ - cat metadata-ingestion/src/datahub/__init__.py && \ + sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" metadata-ingestion/src/datahub/_version.py && \ + sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_version.py && \ + cat metadata-ingestion/src/datahub/_version.py && \ ./gradlew :metadata-ingestion:codegen && \ pip install file:metadata-ingestion-modules/airflow-plugin#egg=acryl-datahub-airflow-plugin file:metadata-ingestion#egg=acryl-datahub diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index ee0333e1cb1d1..a9fd3a6662d1b 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -26,10 +26,10 @@ COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /metadata-inges ARG RELEASE_VERSION WORKDIR /metadata-ingestion -RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \ - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \ - cat src/datahub/__init__.py | grep __version__ && \ - cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__ +RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/_version.py && \ + sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/_version.py && \ + cat src/datahub/_version.py | grep __version__ && \ + cat airflow-plugin/src/datahub_airflow_plugin/_version.py | grep __version__ FROM base AS slim-install diff --git a/docker/datahub-ingestion/Dockerfile-slim-only b/docker/datahub-ingestion/Dockerfile-slim-only index 6ade262f2fede..80abff204df9f 100644 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ b/docker/datahub-ingestion/Dockerfile-slim-only @@ -15,8 +15,8 @@ COPY --chown=datahub ./metadata-ingestion /metadata-ingestion ARG RELEASE_VERSION WORKDIR /metadata-ingestion -RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \ - cat src/datahub/__init__.py +RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/_version.py && \ + cat src/datahub/_version.py FROM base as slim-install diff --git a/metadata-ingestion-modules/airflow-plugin/.gitignore b/metadata-ingestion-modules/airflow-plugin/.gitignore index d0108e8361a06..6801b785ea1e4 100644 --- a/metadata-ingestion-modules/airflow-plugin/.gitignore +++ b/metadata-ingestion-modules/airflow-plugin/.gitignore @@ -1,5 +1,4 @@ .envrc -src/datahub_airflow_plugin/__init__.py.bak .vscode/ output pvenv36/ diff --git a/metadata-ingestion-modules/airflow-plugin/scripts/release.sh b/metadata-ingestion-modules/airflow-plugin/scripts/release.sh index 8f23f72082c2c..994c1ae145ce5 100755 --- 
a/metadata-ingestion-modules/airflow-plugin/scripts/release.sh +++ b/metadata-ingestion-modules/airflow-plugin/scripts/release.sh @@ -1,26 +1,31 @@ #!/bin/bash +# Auto-generated by python-build/generate_release_scripts.py. Do not edit manually. + set -euxo pipefail +ROOT=../.. +MODULE=datahub_airflow_plugin + if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew build # also runs tests + ${ROOT}/gradlew build # also runs tests elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew install + ${ROOT}/gradlew install fi -MODULE=datahub_airflow_plugin - # Check packaging constraint. python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' -if [[ ${RELEASE_VERSION:-} ]]; then - # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/__init__.py -else - vim src/${MODULE}/__init__.py + +# Update the release version. +if [[ ! ${RELEASE_VERSION:-} ]]; then + echo "RELEASE_VERSION is not set" + exit 1 fi +sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/_version.py +# Build and upload the release. rm -rf build dist || true python -m build if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then python -m twine upload 'dist/*' fi -mv src/${MODULE}/__init__.py.bak src/${MODULE}/__init__.py +mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 58c04158957cc..d03ed824c9a26 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -5,7 +5,7 @@ import setuptools package_metadata: dict = {} -with open("./src/datahub_airflow_plugin/__init__.py") as fp: +with open("./src/datahub_airflow_plugin/_version.py") as fp: exec(fp.read(), package_metadata) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py index e4040e3a17dfd..7743c8ab2bab1 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py @@ -1,23 +1,12 @@ -# Published at https://pypi.org/project/acryl-datahub/. -__package_name__ = "acryl-datahub-airflow-plugin" -__version__ = "1!0.0.0.dev0" +from datahub_airflow_plugin._version import __package_name__, __version__ -def is_dev_mode() -> bool: - return __version__.endswith("dev0") - - -def nice_version_name() -> str: - if is_dev_mode(): - return "unavailable (installed in develop mode)" - return __version__ - - -def get_provider_info(): +def get_provider_info() -> dict: + # Register our hooks with Airflow. 
return { "package-name": f"{__package_name__}", "name": f"{__package_name__}", - "description": "Datahub metadata collector plugin", + "description": "DataHub metadata collector plugin", "connection-types": [ { "hook-class-name": "datahub_airflow_plugin.hooks.datahub.DatahubRestHook", diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_version.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_version.py new file mode 100644 index 0000000000000..efda3f6bf3124 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_version.py @@ -0,0 +1,3 @@ +# Published at https://pypi.org/project/acryl-datahub-airflow-plugin/. +__package_name__ = "acryl-datahub-airflow-plugin" +__version__ = "1!0.0.0.dev0" diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index 7638720db023a..2aeaaee4a542d 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -4,12 +4,12 @@ from airflow.plugins_manager import AirflowPlugin -from datahub_airflow_plugin import __package_name__ from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED from datahub_airflow_plugin._airflow_shims import ( HAS_AIRFLOW_LISTENER_API, NEEDS_AIRFLOW_LISTENER_MODULE, ) +from datahub_airflow_plugin._version import __package_name__ assert AIRFLOW_PATCHED logger = logging.getLogger(__name__) diff --git a/metadata-ingestion-modules/dagster-plugin/.gitignore b/metadata-ingestion-modules/dagster-plugin/.gitignore index 4ff42af3e16cf..6801b785ea1e4 100644 --- a/metadata-ingestion-modules/dagster-plugin/.gitignore +++ b/metadata-ingestion-modules/dagster-plugin/.gitignore @@ -1,5 +1,4 @@ .envrc -src/datahub_dagster_plugin/__init__.py.bak .vscode/ output pvenv36/ diff --git a/metadata-ingestion-modules/dagster-plugin/scripts/release.sh b/metadata-ingestion-modules/dagster-plugin/scripts/release.sh index 10cb816d9ffc0..ffd5201574891 100755 --- a/metadata-ingestion-modules/dagster-plugin/scripts/release.sh +++ b/metadata-ingestion-modules/dagster-plugin/scripts/release.sh @@ -1,26 +1,31 @@ #!/bin/bash +# Auto-generated by python-build/generate_release_scripts.py. Do not edit manually. + set -euxo pipefail +ROOT=../.. +MODULE=datahub_dagster_plugin + if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew build # also runs tests + ${ROOT}/gradlew build # also runs tests elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew install + ${ROOT}/gradlew install fi -MODULE=datahub_dagster_plugin - # Check packaging constraint. python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' -if [[ ${RELEASE_VERSION:-} ]]; then - # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/__init__.py -else - vim src/${MODULE}/__init__.py + +# Update the release version. +if [[ ! ${RELEASE_VERSION:-} ]]; then + echo "RELEASE_VERSION is not set" + exit 1 fi +sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/_version.py +# Build and upload the release. 
rm -rf build dist || true python -m build if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then python -m twine upload 'dist/*' fi -mv src/${MODULE}/__init__.py.bak src/${MODULE}/__init__.py +mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 09859b6c4344e..6e2e013f719f5 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -4,7 +4,7 @@ import setuptools package_metadata: dict = {} -with open("./src/datahub_dagster_plugin/__init__.py") as fp: +with open("./src/datahub_dagster_plugin/_version.py") as fp: exec(fp.read(), package_metadata) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/__init__.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/__init__.py index 1c7d60666a085..20baf25413535 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/__init__.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/__init__.py @@ -1,21 +1 @@ -# Published at https://pypi.org/project/acryl-datahub/. -__package_name__ = "acryl-datahub-dagster-plugin" -__version__ = "1!0.0.0.dev0" - - -def is_dev_mode() -> bool: - return __version__.endswith("dev0") - - -def nice_version_name() -> str: - if is_dev_mode(): - return "unavailable (installed in develop mode)" - return __version__ - - -def get_provider_info(): - return { - "package-name": f"{__package_name__}", - "name": f"{__package_name__}", - "description": "Datahub metadata collector plugin", - } +from datahub_dagster_plugin._version import __package_name__, __version__ diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/_version.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/_version.py new file mode 100644 index 0000000000000..e287b6bf32f5d --- /dev/null +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/_version.py @@ -0,0 +1,3 @@ +# Published at https://pypi.org/project/acryl-datahub-dagster-plugin/. +__package_name__ = "acryl-datahub-dagster-plugin" +__version__ = "1!0.0.0.dev0" diff --git a/metadata-ingestion-modules/gx-plugin/.gitignore b/metadata-ingestion-modules/gx-plugin/.gitignore index 8c01744589e35..6801b785ea1e4 100644 --- a/metadata-ingestion-modules/gx-plugin/.gitignore +++ b/metadata-ingestion-modules/gx-plugin/.gitignore @@ -1,5 +1,4 @@ .envrc -src/datahub_gx_plugin/__init__.py.bak .vscode/ output pvenv36/ diff --git a/metadata-ingestion-modules/gx-plugin/scripts/release.sh b/metadata-ingestion-modules/gx-plugin/scripts/release.sh index 058add495821c..06605f03a78aa 100755 --- a/metadata-ingestion-modules/gx-plugin/scripts/release.sh +++ b/metadata-ingestion-modules/gx-plugin/scripts/release.sh @@ -1,26 +1,31 @@ #!/bin/bash +# Auto-generated by python-build/generate_release_scripts.py. Do not edit manually. + set -euxo pipefail +ROOT=../.. +MODULE=datahub_gx_plugin + if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew build # also runs tests + ${ROOT}/gradlew build # also runs tests elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew install + ${ROOT}/gradlew install fi -MODULE=datahub_gx_plugin - # Check packaging constraint. 
python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' -if [[ ${RELEASE_VERSION:-} ]]; then - # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/__init__.py -else - vim src/${MODULE}/__init__.py + +# Update the release version. +if [[ ! ${RELEASE_VERSION:-} ]]; then + echo "RELEASE_VERSION is not set" + exit 1 fi +sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/_version.py +# Build and upload the release. rm -rf build dist || true python -m build if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then python -m twine upload 'dist/*' fi -mv src/${MODULE}/__init__.py.bak src/${MODULE}/__init__.py +mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py diff --git a/metadata-ingestion-modules/gx-plugin/setup.py b/metadata-ingestion-modules/gx-plugin/setup.py index fbc4097388993..43495673a7ff1 100644 --- a/metadata-ingestion-modules/gx-plugin/setup.py +++ b/metadata-ingestion-modules/gx-plugin/setup.py @@ -4,7 +4,7 @@ import setuptools package_metadata: dict = {} -with open("./src/datahub_gx_plugin/__init__.py") as fp: +with open("./src/datahub_gx_plugin/_version.py") as fp: exec(fp.read(), package_metadata) diff --git a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py index a7689be82a5d9..b3f8638c28088 100644 --- a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py +++ b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/__init__.py @@ -1,21 +1 @@ -# Published at https://pypi.org/project/acryl-datahub/. -__package_name__ = "acryl-datahub-gx-plugin" -__version__ = "1!0.0.0.dev0" - - -def is_dev_mode() -> bool: - return __version__.endswith("dev0") - - -def nice_version_name() -> str: - if is_dev_mode(): - return "unavailable (installed in develop mode)" - return __version__ - - -def get_provider_info(): - return { - "package-name": f"{__package_name__}", - "name": f"{__package_name__}", - "description": "Datahub metadata collector plugin", - } +from datahub_gx_plugin._version import __package_name__, __version__ diff --git a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/_version.py b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/_version.py new file mode 100644 index 0000000000000..02dbb70a9b483 --- /dev/null +++ b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/_version.py @@ -0,0 +1,3 @@ +# Published at https://pypi.org/project/acryl-datahub-gx-plugin/. 
+__package_name__ = "acryl-datahub-gx-plugin" +__version__ = "1!0.0.0.dev0" diff --git a/metadata-ingestion-modules/prefect-plugin/.gitignore b/metadata-ingestion-modules/prefect-plugin/.gitignore index 1d2916d00eabd..6801b785ea1e4 100644 --- a/metadata-ingestion-modules/prefect-plugin/.gitignore +++ b/metadata-ingestion-modules/prefect-plugin/.gitignore @@ -1,5 +1,4 @@ .envrc -src/prefect_datahub/__init__.py.bak .vscode/ output pvenv36/ diff --git a/metadata-ingestion-modules/prefect-plugin/scripts/release.sh b/metadata-ingestion-modules/prefect-plugin/scripts/release.sh index f398db98b6029..b3b99d61c904a 100755 --- a/metadata-ingestion-modules/prefect-plugin/scripts/release.sh +++ b/metadata-ingestion-modules/prefect-plugin/scripts/release.sh @@ -1,26 +1,31 @@ #!/bin/bash +# Auto-generated by python-build/generate_release_scripts.py. Do not edit manually. + set -euxo pipefail +ROOT=../.. +MODULE=prefect_datahub + if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew build # also runs tests + ${ROOT}/gradlew build # also runs tests elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../../gradlew install + ${ROOT}/gradlew install fi -MODULE=prefect_datahub - # Check packaging constraint. python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' -if [[ ${RELEASE_VERSION:-} ]]; then - # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/__init__.py -else - vim src/${MODULE}/__init__.py + +# Update the release version. +if [[ ! ${RELEASE_VERSION:-} ]]; then + echo "RELEASE_VERSION is not set" + exit 1 fi +sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/_version.py +# Build and upload the release. rm -rf build dist || true python -m build if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then python -m twine upload 'dist/*' fi -mv src/${MODULE}/__init__.py.bak src/${MODULE}/__init__.py \ No newline at end of file +mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py diff --git a/metadata-ingestion-modules/prefect-plugin/setup.py b/metadata-ingestion-modules/prefect-plugin/setup.py index 1d56cae8d938a..87feb810b8e5a 100644 --- a/metadata-ingestion-modules/prefect-plugin/setup.py +++ b/metadata-ingestion-modules/prefect-plugin/setup.py @@ -4,7 +4,7 @@ import setuptools package_metadata: dict = {} -with open("./src/prefect_datahub/__init__.py") as fp: +with open("./src/prefect_datahub/_version.py") as fp: exec(fp.read(), package_metadata) @@ -30,9 +30,7 @@ def get_long_description(): # Temporary pinning to 2.0.0 until we can upgrade to 3.0.0 "prefect >= 2.0.0,<3.0.0", *rest_common, - # Ignoring the dependency below because it causes issues with the vercel built wheel install - # f"acryl-datahub[datahub-rest]{_self_pin}", - "acryl-datahub[datahub-rest]", + f"acryl-datahub[datahub-rest]{_self_pin}", } diff --git a/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/__init__.py b/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/__init__.py index 8cc65f9010613..f38863a1f31e2 100644 --- a/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/__init__.py +++ b/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/__init__.py @@ -1,21 +1 @@ -# Published at https://pypi.org/project/acryl-datahub/. 
-__package_name__ = "prefect-datahub" -__version__ = "1!0.0.0.dev0" - - -def is_dev_mode() -> bool: - return __version__.endswith("dev0") - - -def nice_version_name() -> str: - if is_dev_mode(): - return "unavailable (installed in develop mode)" - return __version__ - - -def get_provider_info(): - return { - "package-name": f"{__package_name__}", - "name": f"{__package_name__}", - "description": "Datahub prefect block to capture executions and send to Datahub", - } +from prefect_datahub._version import __package_name__, __version__ diff --git a/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/_version.py b/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/_version.py new file mode 100644 index 0000000000000..8c45e7d04b367 --- /dev/null +++ b/metadata-ingestion-modules/prefect-plugin/src/prefect_datahub/_version.py @@ -0,0 +1,3 @@ +# Published at https://pypi.org/project/prefect-datahub/. +__package_name__ = "prefect-datahub" +__version__ = "1!0.0.0.dev0" diff --git a/metadata-ingestion/.gitignore b/metadata-ingestion/.gitignore index acc15c4598869..c7a781ded1d68 100644 --- a/metadata-ingestion/.gitignore +++ b/metadata-ingestion/.gitignore @@ -1,5 +1,4 @@ .envrc -src/datahub/__init__.py.bak .vscode/ output pvenv36/ diff --git a/metadata-ingestion/scripts/release.sh b/metadata-ingestion/scripts/release.sh index a18dd6f934b43..9b4e62aef581a 100755 --- a/metadata-ingestion/scripts/release.sh +++ b/metadata-ingestion/scripts/release.sh @@ -1,26 +1,31 @@ #!/bin/bash +# Auto-generated by python-build/generate_release_scripts.py. Do not edit manually. + set -euxo pipefail +ROOT=.. +MODULE=datahub + if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../gradlew build # also runs tests + ${ROOT}/gradlew build # also runs tests elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then - ../gradlew install + ${ROOT}/gradlew install fi -MODULE=datahub - # Check packaging constraint. python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' -if [[ ${RELEASE_VERSION:-} ]]; then - # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/__init__.py -else - vim src/${MODULE}/__init__.py + +# Update the release version. +if [[ ! ${RELEASE_VERSION:-} ]]; then + echo "RELEASE_VERSION is not set" + exit 1 fi +sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/${MODULE}/_version.py +# Build and upload the release. rm -rf build dist || true python -m build if [[ ! 
${RELEASE_SKIP_UPLOAD:-} ]]; then python -m twine upload 'dist/*' fi -mv src/${MODULE}/__init__.py.bak src/${MODULE}/__init__.py +mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c91dbf709e6d8..b317598930984 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -3,7 +3,7 @@ import setuptools package_metadata: dict = {} -with open("./src/datahub/__init__.py") as fp: +with open("./src/datahub/_version.py") as fp: exec(fp.read(), package_metadata) _version: str = package_metadata["__version__"] @@ -312,7 +312,10 @@ powerbi_report_server = {"requests", "requests_ntlm"} -slack = {"slack-sdk==3.18.1", "tenacity>=8.0.1",} +slack = { + "slack-sdk==3.18.1", + "tenacity>=8.0.1", +} databricks = { # 0.1.11 appears to have authentication issues with azure databricks @@ -505,12 +508,10 @@ "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, "powerbi": ( - ( - microsoft_common - | {"lark[regex]==1.1.4", "sqlparse", "more-itertools"} - | sqlglot_lib - | threading_timeout_common - ) + microsoft_common + | {"lark[regex]==1.1.4", "sqlparse", "more-itertools"} + | sqlglot_lib + | threading_timeout_common ), "powerbi-report-server": powerbi_report_server, "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.2"}, diff --git a/metadata-ingestion/src/datahub/__init__.py b/metadata-ingestion/src/datahub/__init__.py index b254deb7fa30e..8b8ef52d27bb9 100644 --- a/metadata-ingestion/src/datahub/__init__.py +++ b/metadata-ingestion/src/datahub/__init__.py @@ -1,25 +1 @@ -import sys -import warnings - -# Published at https://pypi.org/project/acryl-datahub/. -__package_name__ = "acryl-datahub" -__version__ = "1!0.0.0.dev0" - - -def is_dev_mode() -> bool: - return __version__.endswith("dev0") - - -def nice_version_name() -> str: - if is_dev_mode(): - return "unavailable (installed in develop mode)" - return __version__ - - -if sys.version_info < (3, 8): - warnings.warn( - "DataHub requires Python 3.8 or newer. " - "Please upgrade your Python version to continue using DataHub.", - FutureWarning, - stacklevel=2, - ) +from datahub._version import __package_name__, __version__ diff --git a/metadata-ingestion/src/datahub/_version.py b/metadata-ingestion/src/datahub/_version.py new file mode 100644 index 0000000000000..a34748ac942a1 --- /dev/null +++ b/metadata-ingestion/src/datahub/_version.py @@ -0,0 +1,13 @@ +# Published at https://pypi.org/project/acryl-datahub/. 
+__package_name__ = "acryl-datahub" +__version__ = "1!0.0.0.dev0" + + +def is_dev_mode() -> bool: + return __version__.endswith("dev0") + + +def nice_version_name() -> str: + if is_dev_mode(): + return "unavailable (installed in develop mode)" + return __version__ diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index fbe07b64f0e15..6b3124fc37393 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -9,7 +9,7 @@ import click -from datahub import __package_name__ +from datahub._version import __package_name__ from datahub.cli.json_file import check_mce_file from datahub.configuration import config_loader from datahub.configuration.common import AllowDenyPattern diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index 1f13391644c6c..26f4117e151f9 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -9,7 +9,7 @@ import requests from requests.sessions import Session -import datahub +import datahub._version as datahub_version from datahub.cli import config_utils from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -422,5 +422,5 @@ def ensure_has_system_metadata( if metadata.properties is None: metadata.properties = {} props = metadata.properties - props["clientId"] = datahub.__package_name__ - props["clientVersion"] = datahub.__version__ + props["clientId"] = datahub_version.__package_name__ + props["clientVersion"] = datahub_version.__version__ diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index c9eaccbc65ee2..e2a2f35a36631 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -12,7 +12,7 @@ from click_default_group import DefaultGroup from tabulate import tabulate -import datahub as datahub_package +from datahub._version import nice_version_name from datahub.cli import cli_utils from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH from datahub.configuration.common import ConfigModel, GraphError @@ -147,7 +147,7 @@ def run_pipeline_to_completion(pipeline: Pipeline) -> int: return ret # main function begins - logger.info("DataHub CLI version: %s", datahub_package.nice_version_name()) + logger.info("DataHub CLI version: %s", nice_version_name()) pipeline_config = load_config_file( config, diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 7271f784bf881..4e7a152204da8 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -22,7 +22,7 @@ from requests.adapters import HTTPAdapter, Retry from requests.exceptions import HTTPError, RequestException -from datahub import nice_version_name +from datahub._version import nice_version_name from datahub.cli import config_utils from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else from datahub.cli.env_utils import get_boolean_env_variable diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 182084e479425..73d35381d5df2 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -6,7 +6,7 
@@ import click -import datahub as datahub_package +import datahub._version as datahub_version from datahub.cli.check_cli import check from datahub.cli.cli_utils import ( fixup_gms_url, @@ -74,8 +74,8 @@ help="Write debug-level logs to a file.", ) @click.version_option( - version=datahub_package.nice_version_name(), - prog_name=datahub_package.__package_name__, + version=datahub_version.nice_version_name(), + prog_name=datahub_version.__package_name__, ) def datahub( debug: bool, @@ -112,7 +112,7 @@ def datahub( def version(include_server: bool = False) -> None: """Print version number and exit.""" - click.echo(f"DataHub CLI version: {datahub_package.nice_version_name()}") + click.echo(f"DataHub CLI version: {datahub_version.nice_version_name()}") click.echo(f"Models: {model_version_name()}") click.echo(f"Python version: {sys.version}") if include_server: @@ -223,7 +223,7 @@ def main(**kwargs): logger.exception(f"Command failed: {exc}") logger.debug( - f"DataHub CLI version: {datahub_package.__version__} at {datahub_package.__file__}" + f"DataHub CLI version: {datahub_version.__version__} at {__file__}" ) logger.debug( f"Python version: {sys.version} at {sys.executable} on {platform.platform()}" diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index 5e372a964c7e6..91ee98865e78e 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -17,7 +17,7 @@ import typing_inspect -from datahub import __package_name__ +from datahub._version import __package_name__ from datahub.configuration.common import ConfigurationError if sys.version_info < (3, 10): diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py index c143a8b49f4b7..fc790535cfe03 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py @@ -3,7 +3,7 @@ import time from typing import Any, Dict, Optional -from datahub import nice_version_name +from datahub._version import nice_version_name from datahub.configuration.common import ( ConfigModel, DynamicTypedConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/run/connection.py b/metadata-ingestion/src/datahub/ingestion/run/connection.py index 54b0ab9f22c65..d42a1ba8767f9 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/connection.py +++ b/metadata-ingestion/src/datahub/ingestion/run/connection.py @@ -1,6 +1,6 @@ import logging -from datahub import __version__ +from datahub._version import __version__ from datahub.ingestion.api.source import TestableSource, TestConnectionReport from datahub.ingestion.source.source_registry import source_registry diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 25cbd340c9674..120cf6a79bc02 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -15,7 +15,7 @@ import humanfriendly import psutil -import datahub +from datahub._version import nice_version_name from datahub.configuration.common import ( ConfigModel, IgnorableError, @@ -144,8 +144,8 @@ def _add_init_error_context(step: str) -> Iterator[None]: @dataclass class 
CliReport(Report): - cli_version: str = datahub.nice_version_name() - cli_entry_location: str = datahub.__file__ + cli_version: str = nice_version_name() + cli_entry_location: str = __file__ models_version: str = model_version_name() py_version: str = sys.version py_exec_path: str = sys.executable diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index fd6fa8a50f707..86e577febf454 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -26,7 +26,7 @@ ) from databricks.sdk.service.workspace import ObjectType -import datahub +from datahub._version import nice_version_name from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy from datahub.ingestion.source.unity.proxy_profiling import ( @@ -103,7 +103,7 @@ def __init__( host=workspace_url, token=personal_access_token, product="datahub", - product_version=datahub.nice_version_name(), + product_version=nice_version_name(), ) self.warehouse_id = warehouse_id or "" self.report = report diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py b/metadata-ingestion/src/datahub/telemetry/telemetry.py index 22b2cb6a101af..c64c133fbf456 100644 --- a/metadata-ingestion/src/datahub/telemetry/telemetry.py +++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py @@ -12,7 +12,7 @@ from mixpanel import Consumer, Mixpanel from typing_extensions import ParamSpec -import datahub as datahub_package +from datahub._version import __version__, nice_version_name from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER from datahub.cli.env_utils import get_boolean_env_variable from datahub.configuration.common import ExceptionWithProps @@ -106,7 +106,7 @@ def _default_telemetry_properties() -> Dict[str, Any]: return { - "datahub_version": datahub_package.nice_version_name(), + "datahub_version": nice_version_name(), "python_version": platform.python_version(), "os": platform.system(), "arch": platform.machine(), @@ -132,7 +132,7 @@ def __init__(self): sentry_sdk.init( dsn=SENTRY_DSN, environment=SENTRY_ENVIRONMENT, - release=datahub_package.__version__, + release=__version__, ) except Exception as e: # We need to print initialization errors to stderr, since logger is not initialized yet @@ -277,7 +277,7 @@ def init_capture_exception(self) -> None: "environment", { "environment": SENTRY_ENVIRONMENT, - "datahub_version": datahub_package.nice_version_name(), + "datahub_version": nice_version_name(), "os": platform.system(), "python_version": platform.python_version(), }, diff --git a/metadata-ingestion/src/datahub/testing/check_imports.py b/metadata-ingestion/src/datahub/testing/check_imports.py index e4bf07882b36a..b65b3aa90dca3 100644 --- a/metadata-ingestion/src/datahub/testing/check_imports.py +++ b/metadata-ingestion/src/datahub/testing/check_imports.py @@ -1,4 +1,5 @@ import pathlib +import re from typing import List @@ -32,3 +33,30 @@ def ensure_no_indirect_model_imports(dirs: List[pathlib.Path]) -> None: f"Disallowed import found in {file}: `{line.rstrip()}`. " f"Import from {replacement} instead." ) + + +def ban_direct_datahub_imports(dirs: List[pathlib.Path]) -> None: + # We also want to ban all direct imports of datahub. + # The base `datahub` package is used to export public-facing classes. + # If we import it directly, we'll likely end up with circular imports. 
+ + banned_strings = [ + r"^import datahub[\s$]", + r"^from datahub import", + ] + ignored_files = { + __file__, + } + for dir in dirs: + for file in dir.rglob("*.py"): + if str(file) in ignored_files: + continue + + file_contents = file.read_text() + + for banned_string in banned_strings: + if re.search(banned_string, file_contents, re.MULTILINE): + raise ValueError( + f"Disallowed bare datahub import found in {file}. " + f"Do not import datahub directly; instead import from the underlying file." + ) diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index 7872681797d6f..276f4ccd54a4a 100644 --- a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -10,7 +10,7 @@ from packaging.version import Version from pydantic import BaseModel -from datahub import __version__ +from datahub._version import __version__ from datahub.cli.config_utils import load_client_config from datahub.ingestion.graph.client import DataHubGraph from datahub.utilities.perf_timer import PerfTimer diff --git a/metadata-ingestion/tests/unit/test_packages.py b/metadata-ingestion/tests/unit/test_packages.py index f4045bac6e6ef..ab538cf0c1ed0 100644 --- a/metadata-ingestion/tests/unit/test_packages.py +++ b/metadata-ingestion/tests/unit/test_packages.py @@ -1,7 +1,10 @@ import pytest import setuptools -from datahub.testing.check_imports import ensure_no_indirect_model_imports +from datahub.testing.check_imports import ( + ban_direct_datahub_imports, + ensure_no_indirect_model_imports, +) from datahub.testing.check_str_enum import ensure_no_enum_mixin @@ -16,6 +19,7 @@ def test_check_import_paths(pytestconfig: pytest.Config) -> None: root = pytestconfig.rootpath ensure_no_indirect_model_imports([root / "src", root / "tests"]) + ban_direct_datahub_imports([root / "src", root / "tests"]) def test_check_str_enum_usage(pytestconfig: pytest.Config) -> None: diff --git a/metadata-ingestion/tests/unit/test_packaging.py b/metadata-ingestion/tests/unit/test_packaging.py index 4b99be750a4da..f9a3ae9562d3e 100644 --- a/metadata-ingestion/tests/unit/test_packaging.py +++ b/metadata-ingestion/tests/unit/test_packaging.py @@ -1,6 +1,6 @@ import pytest -import datahub as datahub_metadata +import datahub._version as datahub_version @pytest.mark.filterwarnings( @@ -10,4 +10,4 @@ def test_datahub_version(): # Simply importing pkg_resources checks for unsatisfied dependencies. import pkg_resources - assert pkg_resources.get_distribution(datahub_metadata.__package_name__).version + assert pkg_resources.get_distribution(datahub_version.__package_name__).version diff --git a/python-build/generate_release_scripts.py b/python-build/generate_release_scripts.py new file mode 100644 index 0000000000000..36253a24cfa3b --- /dev/null +++ b/python-build/generate_release_scripts.py @@ -0,0 +1,85 @@ +import dataclasses +import pathlib + +REPO_ROOT = pathlib.Path(__file__).parent.parent + + +@dataclasses.dataclass +class Package: + # TODO: This doesn't have the actual package names. 
+ directory: str + main_module_name: str + + def root_from_directory(self) -> str: + ups = self.directory.count("/") + 1 + + return "/".join([".."] * ups) + + +packages = [ + Package(directory="metadata-ingestion", main_module_name="datahub"), + Package( + directory="metadata-ingestion-modules/airflow-plugin", + main_module_name="datahub_airflow_plugin", + ), + Package( + directory="metadata-ingestion-modules/dagster-plugin", + main_module_name="datahub_dagster_plugin", + ), + Package( + directory="metadata-ingestion-modules/gx-plugin", + main_module_name="datahub_gx_plugin", + ), + Package( + directory="metadata-ingestion-modules/prefect-plugin", + main_module_name="prefect_datahub", + ), +] + +generation_header = f"# Auto-generated by {pathlib.Path(__file__).relative_to(REPO_ROOT)}. Do not edit manually." + +template = """\ +#!/bin/bash +%s + +set -euxo pipefail + +ROOT=%s +MODULE=%s + +if [[ ! ${RELEASE_SKIP_TEST:-} ]] && [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then + ${ROOT}/gradlew build # also runs tests +elif [[ ! ${RELEASE_SKIP_INSTALL:-} ]]; then + ${ROOT}/gradlew install +fi + +# Check packaging constraint. +python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' + +# Update the release version. +if [[ ! ${RELEASE_VERSION:-} ]]; then + echo "RELEASE_VERSION is not set" + exit 1 +fi +sed -i.bak "s/__version__ = .*$/__version__ = \\"$(echo $RELEASE_VERSION|sed s/-/+/)\\"/" src/${MODULE}/_version.py + +# Build and upload the release. +rm -rf build dist || true +python -m build +if [[ ! ${RELEASE_SKIP_UPLOAD:-} ]]; then + python -m twine upload 'dist/*' +fi +mv src/${MODULE}/_version.py.bak src/${MODULE}/_version.py +""" + +for package in packages: + script_path = REPO_ROOT / package.directory / "scripts/release.sh" + + script_path.write_text( + template + % ( + generation_header, + package.root_from_directory(), + package.main_module_name, + ) + ) From a4f8d170f9c232989edd5e6235b6b062b7b326db Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Sat, 25 Jan 2025 10:25:43 -0600 Subject: [PATCH 07/18] misc(search-explain): set default value (#12463) --- .../operations/elastic/OperationsController.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java index 64333009dda7a..ea437f4cf3511 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java @@ -231,7 +231,10 @@ public ResponseEntity explainSearchQuery( @Nullable List sortCriteria, @Parameter(name = "searchFlags", description = "Optional configuration flags.") - @RequestParam(value = "searchFlags", required = false) + @RequestParam( + value = "searchFlags", + required = false, + defaultValue = "{\"fulltext\":true}") @Nullable String searchFlags) throws JsonProcessingException { @@ -338,7 +341,10 @@ public ResponseEntity explainSearchQueryDiff( @Nullable List sortCriteria, @Parameter(name = "searchFlags", description = "Optional configuration flags.") - @RequestParam(value = "searchFlags", required = false) + 
@RequestParam( + value = "searchFlags", + required = false, + defaultValue = "{\"fulltext\":true}") @Nullable String searchFlags) throws JsonProcessingException { From 3e9e6e4fe07d68d3926f73080f4898d142720a58 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Mon, 27 Jan 2025 15:37:31 +0530 Subject: [PATCH 08/18] fix(lookml/ingestion): Skip unreferenced or improperly loaded Lookml view files (#12351) --- .../ingestion/source/looker/lookml_config.py | 5 +- .../ingestion/source/looker/lookml_source.py | 56 ++++++++++++++++ .../lkml_unreachable_views/data.model.lkml | 10 +++ .../employee_income_source.view.lkml | 40 ++++++++++++ .../employee_total_income.view.lkml | 18 ++++++ .../unreachable_view.view.lkml | 18 ++++++ .../tests/integration/lookml/test_lookml.py | 64 ++++++++++++++++--- 7 files changed, 201 insertions(+), 10 deletions(-) create mode 100644 metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/data.model.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_income_source.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_total_income.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/unreachable_view.view.lkml diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index 7ffb895349ed2..4d3255c3c0715 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -139,7 +139,10 @@ class LookMLSourceConfig( ) emit_reachable_views_only: bool = Field( True, - description="When enabled, only views that are reachable from explores defined in the model files are emitted", + description=( + "When enabled, only views that are reachable from explores defined in the model files are emitted. " + "If set to False, all views imported in model files are emitted. Views that are unreachable i.e. not explicitly defined in the model files are currently not emitted however reported as warning for debugging purposes." 
+ ), ) populate_sql_logic_for_missing_descriptions: bool = Field( False, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index a8575c84b510d..9a937840a5012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -59,6 +59,7 @@ from datahub.ingestion.source.looker.lookml_config import ( BASE_PROJECT_NAME, MODEL_FILE_EXTENSION, + VIEW_FILE_EXTENSION, LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, @@ -884,6 +885,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 view_urn = maybe_looker_view.id.get_urn( self.source_config ) + view_connection_mapping = view_connection_map.get( view_urn ) @@ -939,6 +941,9 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 str(maybe_looker_view.id) ) + if not self.source_config.emit_reachable_views_only: + self.report_skipped_unreachable_views(viewfile_loader, processed_view_map) + if ( self.source_config.tag_measures_and_dimensions and self.reporter.events_produced != 0 @@ -966,5 +971,56 @@ def gen_project_workunits(self, project_name: str) -> Iterable[MetadataWorkUnit] ), ).as_workunit() + def report_skipped_unreachable_views( + self, + viewfile_loader: LookerViewFileLoader, + processed_view_map: Dict[str, Set[str]] = {}, + ) -> None: + view_files: Dict[str, List[pathlib.Path]] = {} + for project, folder_path in self.base_projects_folder.items(): + folder = pathlib.Path(folder_path) + view_files[project] = list(folder.glob(f"**/*{VIEW_FILE_EXTENSION}")) + + skipped_view_paths: Dict[str, List[str]] = {} + for project, views in view_files.items(): + skipped_paths: Set[str] = set() + + for view_path in views: + # Check if the view is already in processed_view_map + if not any( + str(view_path) in view_set + for view_set in processed_view_map.values() + ): + looker_viewfile = viewfile_loader.load_viewfile( + path=str(view_path), + project_name=project, + connection=None, + reporter=self.reporter, + ) + + if looker_viewfile is not None: + for raw_view in looker_viewfile.views: + raw_view_name = raw_view.get("name", "") + + if ( + raw_view_name + and self.source_config.view_pattern.allowed( + raw_view_name + ) + ): + skipped_paths.add(str(view_path)) + + skipped_view_paths[project] = list(skipped_paths) + + for project, view_paths in skipped_view_paths.items(): + for path in view_paths: + self.reporter.report_warning( + title="Skipped View File", + message=( + "The Looker view file was skipped because it may not be referenced by any models." 
+ ), + context=(f"Project: {project}, View File Path: {path}"), + ) + def get_report(self): return self.reporter diff --git a/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/data.model.lkml b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/data.model.lkml new file mode 100644 index 0000000000000..b19135659b07a --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/data.model.lkml @@ -0,0 +1,10 @@ +connection: "my_connection" + +include: "employee_income_source.view.lkml" +include: "employee_total_income.view.lkml" + +explore: employee_income_source { +} + +explore: employee_total_income { +} diff --git a/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_income_source.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_income_source.view.lkml new file mode 100644 index 0000000000000..f4a443ab11537 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_income_source.view.lkml @@ -0,0 +1,40 @@ +view: employee_income_source { + derived_table: { + sql: SELECT + employee_id, + employee_name, + {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %} + prod_core.data.r_metric_summary_v2 + {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %} + prod_core.data.r_metric_summary_v3 + {% else %} + 'default_table' as source + {% endif %}, + employee_income + FROM source_table + WHERE + {% condition source_region %} source_table.region {% endcondition %} + ;; + } + + dimension: id { + type: number + sql: ${TABLE}.employee_id;; + } + + dimension: name { + type: string + sql: ${TABLE}.employee_name;; + } + + dimension: source { + type: string + sql: ${TABLE}.source ;; + } + + dimension: income { + type: number + sql: ${TABLE}.employee_income ;; + } + +} diff --git a/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_total_income.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_total_income.view.lkml new file mode 100644 index 0000000000000..18a1ab660b3a1 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/employee_total_income.view.lkml @@ -0,0 +1,18 @@ +view: employee_total_income { + sql_table_name: ${employee_income_source.SQL_TABLE_NAME} ;; + + dimension: id { + type: number + sql: ${TABLE}.id;; + } + + dimension: name { + type: string + sql: ${TABLE}.name;; + } + + measure: total_income { + type: sum + sql: ${TABLE}.income;; + } +} diff --git a/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/unreachable_view.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/unreachable_view.view.lkml new file mode 100644 index 0000000000000..5c75abe41cfce --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/lkml_unreachable_views/unreachable_view.view.lkml @@ -0,0 +1,18 @@ +view: employee_unreachable { + sql_table_name: ${employee_income_source.SQL_TABLE_NAME} ;; + + dimension: id { + type: number + sql: ${TABLE}.id;; + } + + dimension: name { + type: string + sql: ${TABLE}.name;; + } + + measure: total_income { + type: sum + sql: ${TABLE}.income;; + } +} diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index d803b8498104f..ac01132468418 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ 
b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -10,6 +10,8 @@ from freezegun import freeze_time from looker_sdk.sdk.api40.models import DBConnection +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file from datahub.ingestion.source.looker.looker_dataclasses import LookerModel @@ -20,6 +22,7 @@ ) from datahub.ingestion.source.looker.lookml_config import LookMLSourceConfig from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver +from datahub.ingestion.source.looker.lookml_source import LookMLSource from datahub.metadata.schema_classes import ( DatasetSnapshotClass, MetadataChangeEventClass, @@ -78,7 +81,8 @@ def test_lookml_ingest(pytestconfig, tmp_path, mock_time): ) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 mce_helpers.check_golden_file( pytestconfig, @@ -112,7 +116,8 @@ def test_lookml_refinement_ingest(pytestconfig, tmp_path, mock_time): pipeline = Pipeline.create(new_recipe) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 golden_path = test_resources_dir / "refinements_ingestion_golden.json" mce_helpers.check_golden_file( @@ -142,7 +147,8 @@ def test_lookml_refinement_include_order(pytestconfig, tmp_path, mock_time): pipeline = Pipeline.create(new_recipe) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 golden_path = test_resources_dir / "refinement_include_order_golden.json" mce_helpers.check_golden_file( @@ -332,7 +338,8 @@ def test_lookml_ingest_offline(pytestconfig, tmp_path, mock_time): ) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 mce_helpers.check_golden_file( pytestconfig, @@ -377,7 +384,8 @@ def test_lookml_ingest_offline_with_model_deny(pytestconfig, tmp_path, mock_time ) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 mce_helpers.check_golden_file( pytestconfig, @@ -424,7 +432,8 @@ def test_lookml_ingest_offline_platform_instance(pytestconfig, tmp_path, mock_ti ) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 mce_helpers.check_golden_file( pytestconfig, @@ -507,7 +516,8 @@ def ingestion_test( ) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 mce_helpers.check_golden_file( pytestconfig, @@ -553,7 +563,8 @@ def test_lookml_git_info(pytestconfig, tmp_path, mock_time): ) pipeline.run() 
pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 mce_helpers.check_golden_file( pytestconfig, @@ -668,7 +679,8 @@ def test_hive_platform_drops_ids(pytestconfig, tmp_path, mock_time): ) pipeline.run() pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=True) + pipeline.raise_from_status(raise_warnings=False) + assert pipeline.source.get_report().warnings.total_elements == 1 events = read_metadata_file(tmp_path / mce_out) for mce in events: @@ -1051,3 +1063,37 @@ def test_gms_schema_resolution(pytestconfig, tmp_path, mock_time): output_path=tmp_path / mce_out_file, golden_path=golden_path, ) + + +@freeze_time(FROZEN_TIME) +def test_unreachable_views(pytestconfig): + test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" + + config = { + "base_folder": f"{test_resources_dir}/lkml_unreachable_views", + "connection_to_platform_map": {"my_connection": "postgres"}, + "parse_table_names_from_sql": True, + "tag_measures_and_dimensions": False, + "project_name": "lkml_samples", + "model_pattern": {"deny": ["data2"]}, + "emit_reachable_views_only": False, + "liquid_variable": { + "order_region": "ap-south-1", + "source_region": "ap-south-1", + "dw_eff_dt_date": { + "_is_selected": True, + }, + }, + } + + source = LookMLSource( + LookMLSourceConfig.parse_obj(config), + ctx=PipelineContext(run_id="lookml-source-test"), + ) + wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()] + assert len(wu) == 15 + assert source.reporter.warnings.total_elements == 1 + assert ( + "The Looker view file was skipped because it may not be referenced by any models." + in [failure.message for failure in source.get_report().warnings] + ) From 6ab2c702b7a7b9b829a2093827244970e2ee50f9 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 27 Jan 2025 16:13:13 +0530 Subject: [PATCH 09/18] docs: add beta labels for Automation (#12459) --- docs/automations/bigquery-metadata-sync.md | 6 ++++++ docs/automations/docs-propagation.md | 6 ++++++ docs/automations/glossary-term-propagation.md | 6 ++++++ docs/automations/snowflake-tag-propagation.md | 7 ++++++- 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/automations/bigquery-metadata-sync.md b/docs/automations/bigquery-metadata-sync.md index 78bdbdd453e9f..705c3951c060d 100644 --- a/docs/automations/bigquery-metadata-sync.md +++ b/docs/automations/bigquery-metadata-sync.md @@ -4,6 +4,12 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; +:::info + +This feature is currently in open beta in Acryl Cloud. Reach out to your Acryl representative to get access. + +::: + ## Introduction BigQuery Metadata Sync is an automation that synchronizes DataHub Tags, Table and Column descriptions, and Column Glossary Terms with diff --git a/docs/automations/docs-propagation.md b/docs/automations/docs-propagation.md index 9f38902894191..af553b9b84a7e 100644 --- a/docs/automations/docs-propagation.md +++ b/docs/automations/docs-propagation.md @@ -1,5 +1,11 @@ # Documentation Propagation Automation +:::info + +This feature is currently in open beta in Acryl Cloud. Reach out to your Acryl representative to get access. + +::: + ## Introduction Documentation Propagation is an automation automatically propagates column and asset (coming soon) descriptions based on downstream column-level lineage and sibling relationships. 
diff --git a/docs/automations/glossary-term-propagation.md b/docs/automations/glossary-term-propagation.md index 90e8e75ea44ef..5a0f20eb79db2 100644 --- a/docs/automations/glossary-term-propagation.md +++ b/docs/automations/glossary-term-propagation.md @@ -2,6 +2,12 @@ +:::info + +This feature is currently in open beta in Acryl Cloud. Reach out to your Acryl representative to get access. + +::: + ## Introduction Glossary Term Propagation is an automation feature that propagates classification labels (Glossary Terms) across column and assets based on downstream lineage and sibling relationships. diff --git a/docs/automations/snowflake-tag-propagation.md b/docs/automations/snowflake-tag-propagation.md index 8eded451644cc..dd920c247bbfc 100644 --- a/docs/automations/snowflake-tag-propagation.md +++ b/docs/automations/snowflake-tag-propagation.md @@ -4,7 +4,12 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; -> Note that this Automation in currently in open **Beta**. With any questions or issues, please reach out to your Acryl representative. +:::info + +This feature is currently in open beta in Acryl Cloud. Reach out to your Acryl representative to get access. + +::: + ## Introduction From 0c98cdce2e017a2066d72cdc64c641aebd97d214 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 27 Jan 2025 17:24:11 +0530 Subject: [PATCH 10/18] fix(ingest/glue): add info in report (#12470) --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 2ace71b6ff6c1..214b14a2b6c10 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -218,6 +218,7 @@ def platform_validator(cls, v: str) -> str: @dataclass class GlueSourceReport(StaleEntityRemovalSourceReport): + catalog_id: Optional[str] = None tables_scanned = 0 filtered: List[str] = dataclass_field(default_factory=list) databases: EntityFilterReport = EntityFilterReport.field(type="database") @@ -315,6 +316,7 @@ def __init__(self, config: GlueSourceConfig, ctx: PipelineContext): self.extract_owners = config.extract_owners self.source_config = config self.report = GlueSourceReport() + self.report.catalog_id = self.source_config.catalog_id self.glue_client = config.glue_client self.s3_client = config.s3_client self.extract_transforms = config.extract_transforms From 1ca95cc2abe3de16d9c9bdf29a5c1adf58bd0d6e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 27 Jan 2025 08:55:23 -0800 Subject: [PATCH 11/18] docs(ingest/tableau): tweak permissions docs (#12460) --- metadata-ingestion/docs/sources/tableau/tableau_pre.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/docs/sources/tableau/tableau_pre.md b/metadata-ingestion/docs/sources/tableau/tableau_pre.md index a3ac85818a51a..2cc9ed2351322 100644 --- a/metadata-ingestion/docs/sources/tableau/tableau_pre.md +++ b/metadata-ingestion/docs/sources/tableau/tableau_pre.md @@ -12,16 +12,15 @@ DataHub supports two authentication methods: 1. Username/Password 2. [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens) -Either way, the user/token must have the **Site Administrator Explorer** site role. +Either way, the user/token must have at least the **Site Administrator Explorer** site role. 
:::info -We need the **Site Administrator Explorer** site role in order to get complete metadata from Tableau. +We need at least the **Site Administrator Explorer** site role in order to get complete metadata from Tableau. Roles with higher privileges, like **Site Administrator Creator** or **Server Admin** also work. With any lower role, the Tableau Metadata API returns missing/partial metadata. This particularly affects data source fields and definitions, which impacts our ability to extract most columns and generate column lineage. Some table-level lineage is also impacted. - -Other site roles, including Site Administrator Creator and Viewer, are insufficient due to these limitations in the current Tableau Metadata API. +Other site roles, like Viewer or Explorer, are insufficient due to these limitations in the current Tableau Metadata API. ::: From d8ac6cd2586e041a0cb7b18a6c1f04207932dbe2 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Mon, 27 Jan 2025 19:49:12 -0800 Subject: [PATCH 12/18] gql: add data platform instance to search fragment (#12472) --- datahub-web-react/src/graphql/search.graphql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 9edd675402286..d12193b471d46 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -954,6 +954,9 @@ fragment searchResultsWithoutSchemaField on Entity { ...versionProperties } } + ... on DataPlatformInstance { + ...dataPlatformInstanceFields + } } fragment searchResultFields on Entity { From 563656c4d52ee8ab2ee373b78c4f896c643e8def Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Tue, 28 Jan 2025 12:25:45 +0530 Subject: [PATCH 13/18] feat(ingestion/lookml): resolve access notation for LookML Constant (#12277) Co-authored-by: Siddique Bagwan Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../docs/sources/looker/looker_recipe.yml | 14 + .../docs/sources/looker/lookml_post.md | 54 +- .../source/looker/looker_dataclasses.py | 8 + .../source/looker/looker_file_loader.py | 13 +- .../source/looker/looker_template_language.py | 118 +++- .../ingestion/source/looker/lookml_config.py | 18 +- .../ingestion/source/looker/lookml_source.py | 76 ++- .../tests/integration/lookml/test_lookml.py | 143 ++++- .../data.model.lkml | 2 +- .../data.model.lkml | 10 + .../manifest.lkml | 15 + .../star_award_winner.view.lkml | 12 + .../star_award_winner_dev.view.lkml | 17 + .../vv_lineage_lookml_constant_golden.json | 514 ++++++++++++++++++ 14 files changed, 962 insertions(+), 52 deletions(-) create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/data.model.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/manifest.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner_dev.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv_lineage_lookml_constant_golden.json diff --git a/metadata-ingestion/docs/sources/looker/looker_recipe.yml b/metadata-ingestion/docs/sources/looker/looker_recipe.yml index 42209f8cc6809..0939b6546411d 100644 --- a/metadata-ingestion/docs/sources/looker/looker_recipe.yml +++ 
b/metadata-ingestion/docs/sources/looker/looker_recipe.yml
@@ -8,4 +8,18 @@ source:
     client_id: ${LOOKER_CLIENT_ID}
     client_secret: ${LOOKER_CLIENT_SECRET}
+    # Liquid variables
+    # liquid_variables:
+    #   _user_attributes:
+    #     looker_env: "dev"
+    #     dev_database_prefix: "employee"
+    #     dev_schema_prefix: "public"
+    #   dw_eff_dt_date:
+    #     _is_selected: true
+    #   source_region: "ap-south-1"
+    #   db: "test-db"
+
+    # LookML Constants
+    # lookml_constants:
+    #   star_award_winner_year: "public.winner_2025"
 
 # sink configs
diff --git a/metadata-ingestion/docs/sources/looker/lookml_post.md b/metadata-ingestion/docs/sources/looker/lookml_post.md
index 8a4bf823ffc27..fdbe7f3e1217d 100644
--- a/metadata-ingestion/docs/sources/looker/lookml_post.md
+++ b/metadata-ingestion/docs/sources/looker/lookml_post.md
@@ -1,11 +1,49 @@
-#### Configuration Notes
-
-1. If a view contains a liquid template (e.g. `sql_table_name: {{ user_attributes['db']}}.kafka_streaming.events }}`, with `db=ANALYTICS_PROD`), then you will need to specify the values of those variables in the `liquid_variable` config as shown below:
-   ```yml
-   liquid_variable:
-     user_attributes:
-       db: ANALYTICS_PROD
-   ```
+### Configuration Notes
+
+1. Handling Liquid Templates
+
+   If a view contains a liquid template, for example:
+
+   ```
+   sql_table_name: {{ user_attributes['db'] }}.kafka_streaming.events
+   ```
+
+   where `db=ANALYTICS_PROD`, you need to specify the values of those variables in the liquid_variables configuration as shown below:
+
+   ```yml
+   liquid_variables:
+     user_attributes:
+       db: ANALYTICS_PROD
+   ```
+
+2. Resolving LookML Constants
+
+   If a view contains a LookML constant, for example:
+
+   ```
+   sql_table_name: @{db}.kafka_streaming.events;
+   ```
+
+   Ingestion attempts to resolve its value by looking at project manifest files
+
+   ```yml
+   manifest.lkml
+   constant: db {
+     value: "ANALYTICS_PROD"
+   }
+   ```
+
+   - If the constant's value is not resolved or incorrectly resolved, you can specify the `lookml_constants` configuration in the ingestion recipe as shown below. The constant value in the recipe takes precedence over constant values resolved from the manifest.
+
+   ```yml
+   lookml_constants:
+     db: ANALYTICS_PROD
+   ```
+
+
+**Additional Notes**
+
+Although liquid variables and LookML constants can be used anywhere in LookML code, their values are currently resolved only for LookML views by DataHub LookML ingestion. This behavior is sufficient since LookML ingestion processes only views and their upstream dependencies.
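+
+To make this concrete, here is a minimal sketch of how resolution would play out for the example above (the view name is illustrative only, reusing the hypothetical `db` constant from this section):
+
+```
+view: events {
+  sql_table_name: @{db}.kafka_streaming.events ;;
+}
+```
+
+With `db` declared as `ANALYTICS_PROD` in `manifest.lkml`, or overridden via `lookml_constants` in the recipe, ingestion treats the view as if it read `sql_table_name: ANALYTICS_PROD.kafka_streaming.events ;;`; the recipe value wins whenever both are present.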
### Multi-Project LookML (Advanced) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index d771821a14d88..e928c25e22fbd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -32,6 +32,12 @@ class LookerField: sql: Optional[str] +@dataclass +class LookerConstant: + name: str + value: str + + @dataclass class LookerModel: connection: str @@ -75,6 +81,7 @@ def from_looker_dict( try: parsed = load_and_preprocess_file( path=included_file, + reporter=reporter, source_config=source_config, ) included_explores = parsed.get("explores", []) @@ -217,6 +224,7 @@ def resolve_includes( try: parsed = load_and_preprocess_file( path=included_file, + reporter=reporter, source_config=source_config, ) seen_so_far.add(included_file) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index 9fac0b52fde0d..bd6a37fe4b4e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -4,7 +4,10 @@ from typing import Dict, Optional from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition -from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile +from datahub.ingestion.source.looker.looker_dataclasses import ( + LookerConstant, + LookerViewFile, +) from datahub.ingestion.source.looker.looker_template_language import ( load_and_preprocess_file, ) @@ -30,12 +33,14 @@ def __init__( base_projects_folder: Dict[str, pathlib.Path], reporter: LookMLSourceReport, source_config: LookMLSourceConfig, + manifest_constants: Dict[str, LookerConstant] = {}, ) -> None: self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {} self._root_project_name = root_project_name self._base_projects_folder = base_projects_folder self.reporter = reporter self.source_config = source_config + self.manifest_constants = manifest_constants def _load_viewfile( self, project_name: str, path: str, reporter: LookMLSourceReport @@ -71,9 +76,15 @@ def _load_viewfile( try: logger.debug(f"Loading viewfile {path}") + # load_and preprocess_file is called multiple times for loading view file from multiple flows. + # Flag resolve_constants is a hack to avoid passing around manifest_constants from all of the flows. + # This is fine as rest of flows do not need resolution of constants. 
parsed = load_and_preprocess_file( path=path, + reporter=self.reporter, source_config=self.source_config, + resolve_constants=True, + manifest_constants=self.manifest_constants, ) looker_viewfile = LookerViewFile.from_looker_dict( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 2bcae4d46b8d5..60983f04bafa0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -2,7 +2,7 @@ import pathlib import re from abc import ABC, abstractmethod -from typing import Any, ClassVar, Dict, List, Optional, Set, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Set, Union from deepmerge import always_merger from liquid import Undefined @@ -27,8 +27,12 @@ from datahub.ingestion.source.looker.lookml_config import ( DERIVED_VIEW_PATTERN, LookMLSourceConfig, + LookMLSourceReport, ) +if TYPE_CHECKING: + from datahub.ingestion.source.looker.looker_dataclasses import LookerConstant + logger = logging.getLogger(__name__) @@ -82,7 +86,12 @@ def liquid_variable_with_default(self, text: str) -> dict: return self._create_new_liquid_variables_with_default(variables=variables) -def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str: +def resolve_liquid_variable( + text: str, + view_name: str, + liquid_variable: Dict[Any, Any], + report: LookMLSourceReport, +) -> str: # Set variable value to NULL if not present in liquid_variable dictionary Undefined.__str__ = lambda instance: "NULL" # type: ignore try: @@ -96,6 +105,7 @@ def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str: # Resolve liquid template return create_template(text).render(liquid_variable) except LiquidSyntaxError as e: + # TODO: Will add warning once we get rid of duplcate warning message for same view logger.warning(f"Unsupported liquid template encountered. error [{e.message}]") # TODO: There are some tag specific to looker and python-liquid library does not understand them. currently # we are not parsing such liquid template. 
@@ -103,6 +113,7 @@ def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str: # See doc: https://cloud.google.com/looker/docs/templated-filters and look for { % condition region %} # order.region { % endcondition %} except CustomTagException as e: + # TODO: Will add warning once we get rid of duplcate warning message for same view logger.warning(e) logger.debug(e, exc_info=e) @@ -192,15 +203,20 @@ class LookMLViewTransformer(ABC): source_config: LookMLSourceConfig - def __init__(self, source_config: LookMLSourceConfig): + def __init__( + self, + source_config: LookMLSourceConfig, + reporter: LookMLSourceReport, + ): self.source_config = source_config + self.reporter = reporter def transform(self, view: dict) -> dict: value_to_transform: Optional[str] = None - # is_attribute_supported check is required because not all transformer works on all attributes in current - # case mostly all transformer works on sql_table_name and derived.sql attributes, - # however IncompleteSqlTransformer only transform the derived.sql attribute + # is_attribute_supported check is required because not all transformers work on all attributes in the current + # case, mostly all transformers work on sql_table_name and derived.sql attributes; + # however, IncompleteSqlTransformer only transform the derived.sql attribute if SQL_TABLE_NAME in view and self.is_attribute_supported(SQL_TABLE_NAME): # Give precedence to already processed transformed view.sql_table_name to apply more transformation value_to_transform = view.get( @@ -252,7 +268,9 @@ class LiquidVariableTransformer(LookMLViewTransformer): def _apply_transformation(self, value: str, view: dict) -> str: return resolve_liquid_variable( text=value, - liquid_variable=self.source_config.liquid_variable, + liquid_variable=self.source_config.liquid_variables, + view_name=view["name"], + report=self.reporter, ) @@ -287,7 +305,7 @@ def _apply_transformation(self, value: str, view: dict) -> str: class DropDerivedViewPatternTransformer(LookMLViewTransformer): """ - drop ${} from datahub_transformed_sql_table_name and view["derived_table"]["datahub_transformed_sql_table_name"] values. + drop ${} from datahub_transformed_sql_table_name and view["derived_table"]["datahub_transformed_sql_table_name"] values. Example: transform ${employee_income_source.SQL_TABLE_NAME} to employee_income_source.SQL_TABLE_NAME """ @@ -308,8 +326,8 @@ class LookMlIfCommentTransformer(LookMLViewTransformer): evaluate_to_true_regx: str remove_if_comment_line_regx: str - def __init__(self, source_config: LookMLSourceConfig): - super().__init__(source_config=source_config) + def __init__(self, source_config: LookMLSourceConfig, reporter: LookMLSourceReport): + super().__init__(source_config=source_config, reporter=reporter) # This regx will keep whatever after -- if looker_environment -- self.evaluate_to_true_regx = r"-- if {} --".format( @@ -335,6 +353,61 @@ def _apply_transformation(self, value: str, view: dict) -> str: return self._apply_regx(value) +class LookmlConstantTransformer(LookMLViewTransformer): + """ + Replace LookML constants @{constant} from the manifest/configuration. 
+ """ + + CONSTANT_PATTERN = r"@{(\w+)}" # Matches @{constant} + + def __init__( + self, + source_config: LookMLSourceConfig, + reporter: LookMLSourceReport, + manifest_constants: Dict[str, "LookerConstant"], + ): + super().__init__(source_config=source_config, reporter=reporter) + self.manifest_constants = manifest_constants + + def resolve_lookml_constant(self, text: str, view_name: Optional[str]) -> str: + """ + Resolves LookML constants (@{ }) from manifest or config. + Logs warnings for misplaced or missing variables. + """ + + def replace_constants(match): + key = match.group(1) + # Resolve constant from config + if key in self.source_config.lookml_constants: + return str(self.source_config.lookml_constants.get(key)) + + # Resolve constant from manifest + if key in self.manifest_constants: + return self.manifest_constants[key].value + + # Check if it's a misplaced lookml constant + if key in self.source_config.liquid_variables: + self.reporter.warning( + title="Misplaced lookml constant", + message="Use 'lookml_constants' instead of 'liquid_variables'.", + context=f"Key {key}", + ) + return f"@{{{key}}}" + + self.reporter.warning( + title="LookML constant not found", + message="The constant is missing. Either add it under 'lookml_constants' in the config or define it in `manifest.lkml`.", + context=f"view-name: {view_name}, constant: {key}", + ) + return f"@{{{key}}}" + + # Resolve @{} (constant) + return re.sub(self.CONSTANT_PATTERN, replace_constants, text) + + def _apply_transformation(self, value: str, view: dict) -> str: + return self.resolve_lookml_constant(text=value, view_name=view.get("name")) + + class TransformedLookMlView: """ TransformedLookMlView is collecting output of LookMLViewTransformer and creating a new transformed LookML view. @@ -390,24 +463,35 @@ def view(self) -> dict: def process_lookml_template_language( source_config: LookMLSourceConfig, view_lkml_file_dict: dict, + reporter: LookMLSourceReport, + manifest_constants: Dict[str, "LookerConstant"] = {}, + resolve_constants: bool = False, ) -> None: if "views" not in view_lkml_file_dict: return transformers: List[LookMLViewTransformer] = [ LookMlIfCommentTransformer( - source_config=source_config + source_config=source_config, reporter=reporter ), # First evaluate the -- if -- comments. 
Looker does the same LiquidVariableTransformer( - source_config=source_config + source_config=source_config, reporter=reporter ), # Now resolve liquid variables DropDerivedViewPatternTransformer( - source_config=source_config + source_config=source_config, reporter=reporter ), # Remove any ${} symbol IncompleteSqlTransformer( - source_config=source_config + source_config=source_config, reporter=reporter ), # complete any incomplete sql ] + if resolve_constants: + transformers.append( + LookmlConstantTransformer( + source_config=source_config, + manifest_constants=manifest_constants, + reporter=reporter, + ), # Resolve @{} constant with its corresponding value + ) transformed_views: List[dict] = [] @@ -422,12 +506,18 @@ def process_lookml_template_language( def load_and_preprocess_file( path: Union[str, pathlib.Path], source_config: LookMLSourceConfig, + reporter: LookMLSourceReport, + manifest_constants: Dict[str, "LookerConstant"] = {}, + resolve_constants: bool = False, ) -> dict: parsed = load_lkml(path) process_lookml_template_language( view_lkml_file_dict=parsed, + reporter=reporter, source_config=source_config, + manifest_constants=manifest_constants, + resolve_constants=resolve_constants, ) return parsed diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index 4d3255c3c0715..75de6f1fe3c6e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -161,13 +161,27 @@ class LookMLSourceConfig( description="When enabled, looker refinement will be processed to adapt an existing view.", ) - liquid_variable: Dict[Any, Any] = Field( + liquid_variables: Dict[Any, Any] = Field( {}, - description="A dictionary containing Liquid variables and their corresponding values, utilized in SQL-defined " + description="A dictionary containing Liquid variables with their corresponding values, utilized in SQL-defined " "derived views. The Liquid template will be resolved in view.derived_table.sql and " "view.sql_table_name. Defaults to an empty dictionary.", ) + _liquid_variable_deprecated = pydantic_renamed_field( + old_name="liquid_variable", new_name="liquid_variables", print_warning=True + ) + + lookml_constants: Dict[str, str] = Field( + {}, + description=( + "A dictionary containing LookML constants (`@{constant_name}`) and their values. " + "If a constant is defined in the `manifest.lkml` file, its value will be used. " + "If not found in the manifest, the value from this config will be used instead. " + "Defaults to an empty dictionary." + ), + ) + looker_environment: Literal["prod", "dev"] = Field( "prod", description="A looker prod or dev environment. 
" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 9a937840a5012..5f39821ee6c2e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -43,6 +43,7 @@ from datahub.ingestion.source.looker.looker_connection import ( get_connection_def_based_on_connection_string, ) +from datahub.ingestion.source.looker.looker_dataclasses import LookerConstant from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI from datahub.ingestion.source.looker.looker_template_language import ( load_and_preprocess_file, @@ -254,6 +255,7 @@ class LookerManifest: # This must be set if the manifest has local_dependency entries. # See https://cloud.google.com/looker/docs/reference/param-manifest-project-name project_name: Optional[str] + constants: Optional[List[Dict[str, str]]] local_dependencies: List[str] remote_dependencies: List[LookerRemoteDependency] @@ -310,11 +312,14 @@ def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): "manage_models permission enabled on this API key." ) from err + self.manifest_constants: Dict[str, "LookerConstant"] = {} + def _load_model(self, path: str) -> LookerModel: logger.debug(f"Loading model from file {path}") parsed = load_and_preprocess_file( path=path, + reporter=self.reporter, source_config=self.source_config, ) @@ -500,27 +505,33 @@ def get_project_name(self, model_name: str) -> str: def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]: manifest_file = folder / "manifest.lkml" - if manifest_file.exists(): - manifest_dict = load_and_preprocess_file( - path=manifest_file, source_config=self.source_config - ) - manifest = LookerManifest( - project_name=manifest_dict.get("project_name"), - local_dependencies=[ - x["project"] for x in manifest_dict.get("local_dependencys", []) - ], - remote_dependencies=[ - LookerRemoteDependency( - name=x["name"], url=x["url"], ref=x.get("ref") - ) - for x in manifest_dict.get("remote_dependencys", []) - ], + if not manifest_file.exists(): + self.reporter.info( + message="manifest.lkml file missing from project", + context=str(manifest_file), ) - return manifest - else: return None + manifest_dict = load_and_preprocess_file( + path=manifest_file, + source_config=self.source_config, + reporter=self.reporter, + ) + + manifest = LookerManifest( + project_name=manifest_dict.get("project_name"), + constants=manifest_dict.get("constants", []), + local_dependencies=[ + x["project"] for x in manifest_dict.get("local_dependencys", []) + ], + remote_dependencies=[ + LookerRemoteDependency(name=x["name"], url=x["url"], ref=x.get("ref")) + for x in manifest_dict.get("remote_dependencys", []) + ], + ) + return manifest + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -575,7 +586,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.base_projects_folder[project] = p_ref self._recursively_check_manifests( - tmp_dir, BASE_PROJECT_NAME, visited_projects + tmp_dir, + BASE_PROJECT_NAME, + visited_projects, + self.manifest_constants, ) yield from self.get_internal_workunits() @@ -588,7 +602,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) def _recursively_check_manifests( - self, tmp_dir: str, project_name: str, project_visited: Set[str] + self, + tmp_dir: str, 
+ project_name: str, + project_visited: Set[str], + manifest_constants: Dict[str, "LookerConstant"], ) -> None: if project_name in project_visited: return @@ -605,6 +623,14 @@ def _recursively_check_manifests( if not manifest: return + if manifest.constants: + for constant in manifest.constants: + if constant.get("name") and constant.get("value"): + manifest_constants[constant["name"]] = LookerConstant( + name=constant["name"], + value=constant["value"], + ) + # Special case handling if the root project has a name in the manifest file. if project_name == BASE_PROJECT_NAME and manifest.project_name: if ( @@ -664,21 +690,27 @@ def _recursively_check_manifests( project_visited.add(project_name) else: self._recursively_check_manifests( - tmp_dir, remote_project.name, project_visited + tmp_dir, + remote_project.name, + project_visited, + manifest_constants, ) for project in manifest.local_dependencies: - self._recursively_check_manifests(tmp_dir, project, project_visited) + self._recursively_check_manifests( + tmp_dir, project, project_visited, manifest_constants + ) def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 assert self.source_config.base_folder - viewfile_loader = LookerViewFileLoader( self.source_config.project_name, self.base_projects_folder, self.reporter, self.source_config, + self.manifest_constants, ) + logger.debug(f"LookML Constants : {', '.join(self.manifest_constants.keys())}") # Some views can be mentioned by multiple 'include' statements and can be included via different connections. diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index ac01132468418..7baaccbbaa664 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -14,13 +14,20 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file -from datahub.ingestion.source.looker.looker_dataclasses import LookerModel +from datahub.ingestion.source.looker.looker_dataclasses import ( + LookerConstant, + LookerModel, +) from datahub.ingestion.source.looker.looker_template_language import ( + LookmlConstantTransformer, SpecialVariable, load_and_preprocess_file, resolve_liquid_variable, ) -from datahub.ingestion.source.looker.lookml_config import LookMLSourceConfig +from datahub.ingestion.source.looker.lookml_config import ( + LookMLSourceConfig, + LookMLSourceReport, +) from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver from datahub.ingestion.source.looker.lookml_source import LookMLSource from datahub.metadata.schema_classes import ( @@ -835,8 +842,7 @@ def test_manifest_parser(pytestconfig: pytest.Config) -> None: manifest_file = test_resources_dir / "lkml_manifest_samples/complex-manifest.lkml" manifest = load_and_preprocess_file( - path=manifest_file, - source_config=MagicMock(), + path=manifest_file, source_config=MagicMock(), reporter=LookMLSourceReport() ) assert manifest @@ -900,6 +906,31 @@ def test_view_to_view_lineage_and_liquid_template(pytestconfig, tmp_path, mock_t ) +@freeze_time(FROZEN_TIME) +def test_view_to_view_lineage_and_lookml_constant(pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" + mce_out_file = "vv_lineage_lookml_constant_golden.json" + + new_recipe = get_default_recipe( + 
f"{tmp_path}/{mce_out_file}", + f"{test_resources_dir}/vv-lineage-and-lookml-constant", + ) + + new_recipe["source"]["config"]["lookml_constants"] = {"winner_table": "dev"} + + pipeline = Pipeline.create(new_recipe) + pipeline.run() + pipeline.pretty_print_summary() + assert pipeline.source.get_report().warnings.total_elements == 1 + + golden_path = test_resources_dir / "vv_lineage_lookml_constant_golden.json" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_out_file, + golden_path=golden_path, + ) + + @freeze_time(FROZEN_TIME) def test_special_liquid_variables(): text: str = """{% assign source_table_variable = "source_table" | sql_quote | non_existing_filter_where_it_should_not_fail %} @@ -966,6 +997,8 @@ def test_special_liquid_variables(): actual_text = resolve_liquid_variable( text=text, liquid_variable=input_liquid_variable, + report=LookMLSourceReport(), + view_name="test", ) expected_text: str = ( @@ -976,6 +1009,108 @@ def test_special_liquid_variables(): assert actual_text == expected_text +@pytest.mark.parametrize( + "view, expected_result, warning_expected", + [ + # Case 1: Single constant replacement in sql_table_name + ( + {"sql_table_name": "@{constant1}.kafka_streaming.events"}, + {"datahub_transformed_sql_table_name": "value1.kafka_streaming.events"}, + False, + ), + # Case 2: Single constant replacement with config-defined constant + ( + {"sql_table_name": "SELECT * FROM @{constant2}"}, + {"datahub_transformed_sql_table_name": "SELECT * FROM value2"}, + False, + ), + # Case 3: Multiple constants in a derived_table SQL query + ( + {"derived_table": {"sql": "SELECT @{constant1}, @{constant3}"}}, + { + "derived_table": { + "datahub_transformed_sql": "SELECT value1, manifest_value3" + } + }, + False, + ), + # Case 4: Non-existent constant in sql_table_name + ( + {"sql_table_name": "SELECT * FROM @{nonexistent}"}, + {"datahub_transformed_sql_table_name": "SELECT * FROM @{nonexistent}"}, + False, + ), + # Case 5: View with unsupported attribute + ({"unsupported_attribute": "SELECT * FROM @{constant1}"}, {}, False), + # Case 6: View with no transformable attributes + ( + {"sql_table_name": "SELECT * FROM table_name"}, + {"datahub_transformed_sql_table_name": "SELECT * FROM table_name"}, + False, + ), + # Case 7: Constants only in manifest_constants + ( + {"sql_table_name": "SELECT @{constant3}"}, + {"datahub_transformed_sql_table_name": "SELECT manifest_value3"}, + False, + ), + # Case 8: Constants only in lookml_constants + ( + {"sql_table_name": "SELECT @{constant2}"}, + {"datahub_transformed_sql_table_name": "SELECT value2"}, + False, + ), + # Case 9: Multiple unsupported attributes + ( + { + "unsupported_attribute": "SELECT @{constant1}", + "another_unsupported_attribute": "SELECT @{constant2}", + }, + {}, + False, + ), + # Case 10: Misplaced lookml constant + ( + {"sql_table_name": "@{constant1}.@{constant2}.@{constant4}"}, + {"datahub_transformed_sql_table_name": "value1.value2.@{constant4}"}, + True, + ), + ], +) +@freeze_time(FROZEN_TIME) +def test_lookml_constant_transformer(view, expected_result, warning_expected): + """ + Test LookmlConstantTransformer with various view structures. 
+ """ + config = MagicMock() + report = MagicMock() + config.lookml_constants = { + "constant1": "value1", + "constant2": "value2", + } + config.liquid_variables = { + "constant4": "liquid_value1", + } + + transformer = LookmlConstantTransformer( + source_config=config, + reporter=report, + manifest_constants={ + "constant1": LookerConstant(name="constant1", value="manifest_value1"), + "constant3": LookerConstant(name="constant3", value="manifest_value3"), + }, + ) + + result = transformer.transform(view) + assert result == expected_result + if warning_expected: + report.warning.assert_called_once_with( + title="Misplaced lookml constant", + message="Use 'lookml_constants' instead of 'liquid_variables'.", + context="Key constant4", + ) + + @freeze_time(FROZEN_TIME) def test_field_tag_ingest(pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml index d570e0ecdb5b2..4de4df34e15d1 100644 --- a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml @@ -39,4 +39,4 @@ explore: rent_as_employee_income_source { } explore: child_view { -} \ No newline at end of file +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/data.model.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/data.model.lkml new file mode 100644 index 0000000000000..6f425c469c954 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/data.model.lkml @@ -0,0 +1,10 @@ +connection: "my_connection" + +include: "star_award_winner.view.lkml" +include: "star_award_winner_dev.view.lkml" + +explore: star_award_winner { +} + +explore: star_award_winner_dev { +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/manifest.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/manifest.lkml new file mode 100644 index 0000000000000..fcdd71a626294 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/manifest.lkml @@ -0,0 +1,15 @@ +constant: customer_support_db { + value: "star_award_winner_year" + export: none +} + +constant: customer_support_schema { + value: "public" + export: none +} + +constant: customer_support_table { + value: "winner" + export: none +} + diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner.view.lkml new file mode 100644 index 0000000000000..fd0fcf33c376e --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner.view.lkml @@ -0,0 +1,12 @@ +view: star_award_winner { + sql_table_name: @{customer_support_db}.@{customer_support_schema}.@{invalid_constant};; + + + dimension: id { + label: "id" + primary_key: yes + type: number + sql: ${TABLE}.id ;; + } + +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner_dev.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner_dev.view.lkml new file mode 
100644 index 0000000000000..0c2417251fc15 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-lookml-constant/star_award_winner_dev.view.lkml @@ -0,0 +1,17 @@ +view: star_award_winner_dev { + sql_table_name: @{customer_support_db}.@{customer_support_schema}.@{winner_table};; + + + dimension: id { + label: "id" + primary_key: yes + type: number + sql: ${TABLE}.id ;; + } + + dimension: name { + type: string + sql: ${TABLE}.name;; + } + +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv_lineage_lookml_constant_golden.json b/metadata-ingestion/tests/integration/lookml/vv_lineage_lookml_constant_golden.json new file mode 100644 index 0000000000000..296f09b697ee4 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv_lineage_lookml_constant_golden.json @@ -0,0 +1,514 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "project_name": "lkml_samples" + }, + "name": "lkml_samples", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Folders" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: star_award_winner {\n sql_table_name: @{customer_support_db}.@{customer_support_schema}.@{invalid_constant};;\n\n\n dimension: id {\n label: \"id\"\n primary_key: yes\n 
type: number\n sql: ${TABLE}.id ;;\n }\n\n}", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,star_award_winner_year.public.@{invalid_constant},PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,star_award_winner_year.public.@{invalid_constant},PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner,PROD),id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "star_award_winner", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "id", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": true + } + ], + "primaryKeys": [ + "id" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "star_award_winner.view.lkml", + "looker.model": "data" + }, + "name": "star_award_winner", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: star_award_winner_dev {\n sql_table_name: @{customer_support_db}.@{customer_support_schema}.@{winner_table};;\n\n\n dimension: id {\n label: \"id\"\n primary_key: yes\n type: number\n sql: ${TABLE}.id ;;\n }\n\n dimension: name {\n type: string\n sql: ${TABLE}.name;;\n }\n\n}", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,star_award_winner_year.public.dev,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,star_award_winner_year.public.dev,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,star_award_winner_year.public.dev,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD),name)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "star_award_winner_dev", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "id", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + 
"nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": true + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [ + "id" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "star_award_winner_dev.view.lkml", + "looker.model": "data" + }, + "name": "star_award_winner_dev", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.star_award_winner_dev,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file From 7870b13490e684e1179c8df7f95ec52c8ea3b090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 28 Jan 2025 10:26:19 +0100 Subject: [PATCH 14/18] feat(snowflake): set is_temp_table and is_allowed_table function for SqlParsingAggregator in SnowflakeV2Source (#12438) --- .../source/snowflake/snowflake_v2.py | 45 +++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index b4ef2180d71d4..7d63f41f4bcf0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -5,6 +5,7 @@ import os import os.path import platform +import re from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Union @@ -33,6 +34,7 @@ from datahub.ingestion.source.snowflake.constants import ( GENERIC_PERMISSION_ERROR_KEY, SnowflakeEdition, + SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_assertion import ( SnowflakeAssertionsHandler, @@ -162,6 +164,8 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None + self.discovered_datasets: Optional[List[str]] = None + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( SqlParsingAggregator( platform=self.identifiers.platform, @@ -182,6 +186,8 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): generate_usage_statistics=False, generate_operations=False, 
format_queries=self.config.format_sql_queries, + is_temp_table=self._is_temp_table, + is_allowed_table=self._is_allowed_table, ) ) self.report.sql_aggregator = self.aggregator.report @@ -444,6 +450,34 @@ class SnowflakePrivilege: return _report + def _is_temp_table(self, name: str) -> bool: + if any( + re.match(pattern, name, flags=re.IGNORECASE) + for pattern in self.config.temporary_tables_pattern + ): + return True + + # This is also a temp table if + # 1. this name would be allowed by the dataset patterns, and + # 2. we have a list of discovered tables, and + # 3. it's not in the discovered tables list + if ( + self.filters.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE) + and self.discovered_datasets + and name not in self.discovered_datasets + ): + return True + + return False + + def _is_allowed_table(self, name: str) -> bool: + if self.discovered_datasets and name not in self.discovered_datasets: + return False + + return self.filters.is_dataset_pattern_allowed( + name, SnowflakeObjectDomain.TABLE + ) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -513,7 +547,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) return - discovered_datasets = discovered_tables + discovered_views + self.discovered_datasets = discovered_tables + discovered_views if self.config.use_queries_v2: with self.report.new_stage(f"*: {VIEW_PARSING}"): @@ -538,13 +572,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: filters=self.filters, identifiers=self.identifiers, schema_resolver=schema_resolver, - discovered_tables=discovered_datasets, + discovered_tables=self.discovered_datasets, graph=self.ctx.graph, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors, # it should be pretty straightforward to refactor this and only initialize the aggregator once. + # This also applies for the _is_temp_table and _is_allowed_table methods above, duplicated from SnowflakeQueriesExtractor. 
self.report.queries_extractor = queries_extractor.report yield from queries_extractor.get_workunits_internal() queries_extractor.close() @@ -568,12 +603,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if ( self.config.include_usage_stats or self.config.include_operational_stats ) and self.usage_extractor: - yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + yield from self.usage_extractor.get_usage_workunits( + self.discovered_datasets + ) if self.config.include_assertion_results: yield from SnowflakeAssertionsHandler( self.config, self.report, self.connection, self.identifiers - ).get_assertion_workunits(discovered_datasets) + ).get_assertion_workunits(self.discovered_datasets) self.connection.close() From 79aa40f1e69632ab6e0d63c1c9554a682a99424c Mon Sep 17 00:00:00 2001 From: skrydal Date: Tue, 28 Jan 2025 13:06:24 +0100 Subject: [PATCH 15/18] log(ingest/lookml): view file missing/parsing as warnings (#12448) --- .../src/datahub/ingestion/source/looker/looker_config.py | 4 +++- .../src/datahub/ingestion/source/looker/looker_file_loader.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 3ed3186399588..0f8d86a2cbd29 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -177,7 +177,9 @@ def _get_generic_definition( class LookerConnectionDefinition(ConfigModel): platform: str default_db: str - default_schema: Optional[str] # Optional since some sources are two-level only + default_schema: Optional[str] = ( + None # Optional since some sources are two-level only + ) platform_instance: Optional[str] = None platform_env: Optional[str] = Field( default=None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index bd6a37fe4b4e2..ba7b62a1281c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -65,7 +65,7 @@ def _load_viewfile( with open(path) as file: raw_file_content = file.read() except Exception as e: - self.reporter.failure( + self.reporter.report_warning( title="LKML File Loading Error", message="A lookml file is not present on local storage or GitHub", context=f"file path: {path}", @@ -101,7 +101,7 @@ def _load_viewfile( self.viewfile_cache[path] = looker_viewfile return looker_viewfile except Exception as e: - self.reporter.failure( + self.reporter.report_warning( title="LKML File Parsing Error", message="The input file is not lookml file", context=f"file path: {path}", From 15c3783532ceb489f783baf1f7a43d3b98a362a8 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Tue, 28 Jan 2025 13:57:42 -0800 Subject: [PATCH 16/18] docs(entity-change-events): include add/remove/update examples (#12388) --- docs/actions/events/entity-change-event.md | 69 +++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/docs/actions/events/entity-change-event.md b/docs/actions/events/entity-change-event.md index 27277a97ad199..4a7264c20bcc6 100644 --- a/docs/actions/events/entity-change-event.md +++ b/docs/actions/events/entity-change-event.md @@ -219,6 +219,73 @@ This event is emitted when an existing owner has been 
removed from an entity on } ``` +### Add Structured Property Event + +This event is emitted when a Structured Property has been added to an entity on DataHub. + +#### Sample Event +```json +{ + "entityUrn": "urn:li:dataset:abc", + "entityType": "dataset", + "category": "STRUCTURED_PROPERTY", + "operation": "ADD", + "modifier": "urn:li:structuredProperty:prop1", + "parameters": { + "propertyUrn": "urn:li:structuredProperty:prop1", + "propertyValues": "[\"value1\"]" + }, + "version": 0, + "auditStamp": { + "actor": "urn:li:corpuser:jdoe", + "time": 1649953100653 + } +} +``` + +### Remove Structured Property Event + +This event is emitted when a Structured Property has been removed from an entity on DataHub. + +#### Sample Event +```json +{ + "entityUrn": "urn:li:dataset:abc", + "entityType": "dataset", + "category": "STRUCTURED_PROPERTY", + "operation": "REMOVE", + "modifier": "urn:li:structuredProperty:prop1", + "version": 0, + "auditStamp": { + "actor": "urn:li:corpuser:jdoe", + "time": 1649953100653 + } +} +``` + +### Modify Structured Property Event + +This event is emitted when a Structured Property's values have been modified on an entity in DataHub. + +#### Sample Event +```json +{ + "entityUrn": "urn:li:dataset:abc", + "entityType": "dataset", + "category": "STRUCTURED_PROPERTY", + "operation": "MODIFY", + "modifier": "urn:li:structuredProperty:prop1", + "parameters": { + "propertyUrn": "urn:li:structuredProperty:prop1", + "propertyValues": "[\"value1\",\"value2\"]" + }, + "version": 0, + "auditStamp": { + "actor": "urn:li:corpuser:jdoe", + "time": 1649953100653 + } +} +``` ### Modify Deprecation Event @@ -349,4 +416,4 @@ This event is emitted when a new entity has been hard-deleted on DataHub. "time": 1649953100653 } } -``` \ No newline at end of file +``` From dbd57c972f79ae1469adc0c8bde77fd6821fb819 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:59:01 -0600 Subject: [PATCH 17/18] fix(ci): fix datahub-ingestion release/tag publishing (#12466) --- .github/workflows/docker-unified.yml | 14 +++++++------- docker/datahub-ingestion-base/build.gradle | 2 +- docker/datahub-ingestion/build.gradle | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 5f944c8e28769..a756b27a38e84 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -44,6 +44,7 @@ jobs: tag: ${{ steps.tag.outputs.tag }} slim_tag: ${{ steps.tag.outputs.slim_tag }} full_tag: ${{ steps.tag.outputs.full_tag }} + short_sha: ${{ steps.tag.outputs.short_sha }} # needed for auto-deploy unique_tag: ${{ steps.tag.outputs.unique_tag }} unique_slim_tag: ${{ steps.tag.outputs.unique_slim_tag }} unique_full_tag: ${{ steps.tag.outputs.unique_full_tag }} @@ -65,6 +66,8 @@ jobs: postgres_setup_change: ${{ steps.ci-optimize.outputs.postgres-setup-change == 'true' }} elasticsearch_setup_change: ${{ steps.ci-optimize.outputs.elasticsearch-setup-change == 'true' }} smoke_test_change: ${{ steps.ci-optimize.outputs.smoke-test-change == 'true' }} + integrations_service_change: "false" + datahub_executor_change: "false" steps: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 @@ -864,7 +867,8 @@ jobs: context: . 
file: ./docker/datahub-ingestion/Dockerfile platforms: linux/amd64,linux/arm64/v8 - depot-project: ${{ vars.DEPOT_PROJECT_ID }} + # Workaround 2025-01-25 - Depot publishing errors + depot-project: ${{ (startsWith(github.ref, 'refs/tags/') || github.event_name == 'release') && '' || vars.DEPOT_PROJECT_ID }} - name: Compute Tag id: tag run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT" @@ -963,7 +967,8 @@ jobs: context: . file: ./docker/datahub-ingestion/Dockerfile platforms: linux/amd64,linux/arm64/v8 - depot-project: ${{ vars.DEPOT_PROJECT_ID }} + # Workaround 2025-01-25 - Depot publishing errors + depot-project: ${{ (startsWith(github.ref, 'refs/tags/') || github.event_name == 'release') && '' || vars.DEPOT_PROJECT_ID }} - name: Compute Tag (Full) id: tag run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT" @@ -1178,11 +1183,6 @@ jobs: docker pull '${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:head' docker tag '${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:head' '${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }}' fi - if [ '${{ needs.setup.outputs.integrations_service_change }}' == 'false' ]; then - echo 'datahub-integration-service head images' - docker pull '${{ env.DATAHUB_INTEGRATIONS_IMAGE }}:head' - docker tag '${{ env.DATAHUB_INTEGRATIONS_IMAGE }}:head' '${{ env.DATAHUB_INTEGRATIONS_IMAGE }}:${{ needs.setup.outputs.unique_tag }}' - fi - name: CI Slim Head Images run: | if [ '${{ needs.setup.outputs.ingestion_change }}' == 'false' ]; then diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index f19faa227ca61..b3ed6463b9f6c 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" - revision = 8 // increment to trigger rebuild + revision = 10 // increment to trigger rebuild } docker { diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index b236a53c288f7..b0b666f75eb5a 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? 
'-slim' : ''}" - revision = 9 // increment to trigger rebuild + revision = 10 // increment to trigger rebuild } dependencies { From 47134c272bd82ff8d00b6a30c725fbde4165335c Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Wed, 29 Jan 2025 11:41:56 +0900 Subject: [PATCH 18/18] feat: update ml system UI (#12334) Co-authored-by: Andrew Sikowitz Co-authored-by: RyanHolstien Co-authored-by: Shirshanka Das Co-authored-by: ryota-cloud --- .../mappers/MLModelGroupPropertiesMapper.java | 33 +++ .../mappers/MLModelPropertiesMapper.java | 15 ++ .../src/main/resources/lineage.graphql | 29 +++ .../MLModelGroupPropertiesMapperTest.java | 68 ++++++ .../mappers/MLModelPropertiesMapperTest.java | 187 ++++++++++++++++ .../src/app/entity/EntityPage.tsx | 1 + .../DataProcessInstanceEntity.tsx | 72 ++---- .../dataProcessInstance/preview/Preview.tsx | 20 +- .../profile/DataProcessInstanceSummary.tsx | 102 +++++++++ .../src/app/entity/mlModel/MLModelEntity.tsx | 2 +- .../app/entity/mlModel/preview/Preview.tsx | 3 +- .../entity/mlModel/profile/MLModelSummary.tsx | 96 +++++++- .../mlModelGroup/MLModelGroupEntity.tsx | 2 +- .../entity/mlModelGroup/preview/Preview.tsx | 3 +- .../mlModelGroup/profile/ModelGroupModels.tsx | 208 ++++++++++++++++-- .../search/EmbeddedListSearchSection.tsx | 27 ++- .../src/app/entity/shared/constants.ts | 4 + .../DataProcessInstanceRightColumn.tsx | 87 ++++++++ .../src/app/preview/DefaultPreviewCard.tsx | 12 +- .../src/app/shared/time/timeUtils.tsx | 38 ++++ .../src/graphql/fragments.graphql | 12 + datahub-web-react/src/graphql/lineage.graphql | 32 +++ datahub-web-react/src/graphql/mlModel.graphql | 17 ++ datahub-web-react/src/graphql/search.graphql | 9 + 24 files changed, 989 insertions(+), 90 deletions(-) create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapperTest.java create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapperTest.java create mode 100644 datahub-web-react/src/app/entity/dataProcessInstance/profile/DataProcessInstanceSummary.tsx create mode 100644 datahub-web-react/src/app/preview/DataProcessInstanceRightColumn.tsx diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java index a6cfded9865d9..2da2fa2a58a6a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java @@ -3,8 +3,11 @@ import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.MLModelGroupProperties; +import com.linkedin.datahub.graphql.generated.MLModelLineageInfo; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.common.mappers.TimeStampToAuditStampMapper; import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -33,10 +36,40 @@ public MLModelGroupProperties apply( result.setVersion(VersionTagMapper.map(context, mlModelGroupProperties.getVersion())); } 
result.setCreatedAt(mlModelGroupProperties.getCreatedAt()); + if (mlModelGroupProperties.hasCreated()) { + result.setCreated( + TimeStampToAuditStampMapper.map(context, mlModelGroupProperties.getCreated())); + } + if (mlModelGroupProperties.getName() != null) { + result.setName(mlModelGroupProperties.getName()); + } else { + // backfill name from URN for backwards compatibility + result.setName(entityUrn.getEntityKey().get(1)); // indexed access is safe here + } + + if (mlModelGroupProperties.hasLastModified()) { + result.setLastModified( + TimeStampToAuditStampMapper.map(context, mlModelGroupProperties.getLastModified())); + } result.setCustomProperties( CustomPropertiesMapper.map(mlModelGroupProperties.getCustomProperties(), entityUrn)); + final MLModelLineageInfo lineageInfo = new MLModelLineageInfo(); + if (mlModelGroupProperties.hasTrainingJobs()) { + lineageInfo.setTrainingJobs( + mlModelGroupProperties.getTrainingJobs().stream() + .map(urn -> urn.toString()) + .collect(Collectors.toList())); + } + if (mlModelGroupProperties.hasDownstreamJobs()) { + lineageInfo.setDownstreamJobs( + mlModelGroupProperties.getDownstreamJobs().stream() + .map(urn -> urn.toString()) + .collect(Collectors.toList())); + } + result.setMlModelLineageInfo(lineageInfo); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java index 7b00fe88f2d68..1f1003dea720c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java @@ -5,6 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.MLModelGroup; +import com.linkedin.datahub.graphql.generated.MLModelLineageInfo; import com.linkedin.datahub.graphql.generated.MLModelProperties; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; import com.linkedin.datahub.graphql.types.common.mappers.TimeStampToAuditStampMapper; @@ -87,6 +88,20 @@ public MLModelProperties apply( .collect(Collectors.toList())); } result.setTags(mlModelProperties.getTags()); + final MLModelLineageInfo lineageInfo = new MLModelLineageInfo(); + if (mlModelProperties.hasTrainingJobs()) { + lineageInfo.setTrainingJobs( + mlModelProperties.getTrainingJobs().stream() + .map(urn -> urn.toString()) + .collect(Collectors.toList())); + } + if (mlModelProperties.hasDownstreamJobs()) { + lineageInfo.setDownstreamJobs( + mlModelProperties.getDownstreamJobs().stream() + .map(urn -> urn.toString()) + .collect(Collectors.toList())); + } + result.setMlModelLineageInfo(lineageInfo); return result; } diff --git a/datahub-graphql-core/src/main/resources/lineage.graphql b/datahub-graphql-core/src/main/resources/lineage.graphql index 975d013a44805..abb1446421858 100644 --- a/datahub-graphql-core/src/main/resources/lineage.graphql +++ b/datahub-graphql-core/src/main/resources/lineage.graphql @@ -25,3 +25,32 @@ input LineageEdge { """ upstreamUrn: String! } + +""" +Represents lineage information for ML entities. +""" +type MLModelLineageInfo { + """ + List of jobs or processes used to train the model. + """ + trainingJobs: [String!] + + """ + List of jobs or processes that use this model. 
+ """ + downstreamJobs: [String!] +} + +extend type MLModelProperties { + """ + Information related to lineage to this model group + """ + mlModelLineageInfo: MLModelLineageInfo +} + +extend type MLModelGroupProperties { + """ + Information related to lineage to this model group + """ + mlModelLineageInfo: MLModelLineageInfo +} \ No newline at end of file diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapperTest.java new file mode 100644 index 0000000000000..fc738837c09d1 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapperTest.java @@ -0,0 +1,68 @@ +package com.linkedin.datahub.graphql.types.mlmodel.mappers; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; + +import com.linkedin.common.urn.Urn; +import com.linkedin.ml.metadata.MLModelGroupProperties; +import java.net.URISyntaxException; +import org.testng.annotations.Test; + +public class MLModelGroupPropertiesMapperTest { + + @Test + public void testMapMLModelGroupProperties() throws URISyntaxException { + // Create backend ML Model Group Properties + MLModelGroupProperties input = new MLModelGroupProperties(); + + // Set description + input.setDescription("a ml trust model group"); + + // Set Name + input.setName("ML trust model group"); + + // Create URN + Urn groupUrn = + Urn.createFromString( + "urn:li:mlModelGroup:(urn:li:dataPlatform:sagemaker,another-group,PROD)"); + + // Map the properties + com.linkedin.datahub.graphql.generated.MLModelGroupProperties result = + MLModelGroupPropertiesMapper.map(null, input, groupUrn); + + // Verify mapped properties + assertNotNull(result); + assertEquals(result.getDescription(), "a ml trust model group"); + assertEquals(result.getName(), "ML trust model group"); + + // Verify lineage info is null as in the mock data + assertNotNull(result.getMlModelLineageInfo()); + assertNull(result.getMlModelLineageInfo().getTrainingJobs()); + assertNull(result.getMlModelLineageInfo().getDownstreamJobs()); + } + + @Test + public void testMapWithMinimalProperties() throws URISyntaxException { + // Create backend ML Model Group Properties with minimal information + MLModelGroupProperties input = new MLModelGroupProperties(); + + // Create URN + Urn groupUrn = + Urn.createFromString( + "urn:li:mlModelGroup:(urn:li:dataPlatform:sagemaker,another-group,PROD)"); + + // Map the properties + com.linkedin.datahub.graphql.generated.MLModelGroupProperties result = + MLModelGroupPropertiesMapper.map(null, input, groupUrn); + + // Verify basic mapping with minimal properties + assertNotNull(result); + assertNull(result.getDescription()); + + // Verify lineage info is null + assertNotNull(result.getMlModelLineageInfo()); + assertNull(result.getMlModelLineageInfo().getTrainingJobs()); + assertNull(result.getMlModelLineageInfo().getDownstreamJobs()); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapperTest.java new file mode 100644 index 0000000000000..17fa7a0abe139 --- /dev/null +++ 
b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapperTest.java @@ -0,0 +1,187 @@ +package com.linkedin.datahub.graphql.types.mlmodel.mappers; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; + +import com.linkedin.common.MLFeatureUrnArray; +import com.linkedin.common.TimeStamp; +import com.linkedin.common.VersionTag; +import com.linkedin.common.url.Url; +import com.linkedin.common.urn.MLFeatureUrn; +import com.linkedin.common.urn.MLModelUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.StringArray; +import com.linkedin.data.template.StringMap; +import com.linkedin.ml.metadata.MLHyperParam; +import com.linkedin.ml.metadata.MLHyperParamArray; +import com.linkedin.ml.metadata.MLMetric; +import com.linkedin.ml.metadata.MLMetricArray; +import com.linkedin.ml.metadata.MLModelProperties; +import java.net.URISyntaxException; +import org.testng.annotations.Test; + +public class MLModelPropertiesMapperTest { + + @Test + public void testMapMLModelProperties() throws URISyntaxException { + MLModelProperties input = new MLModelProperties(); + + // Set basic properties + input.setName("TestModel"); + input.setDescription("A test ML model"); + input.setType("Classification"); + + // Set version + VersionTag versionTag = new VersionTag(); + versionTag.setVersionTag("1.0.0"); + input.setVersion(versionTag); + + // Set external URL + Url externalUrl = new Url("https://example.com/model"); + input.setExternalUrl(externalUrl); + + // Set created and last modified timestamps + TimeStamp createdTimeStamp = new TimeStamp(); + createdTimeStamp.setTime(1000L); + Urn userUrn = Urn.createFromString("urn:li:corpuser:test"); + createdTimeStamp.setActor(userUrn); + input.setCreated(createdTimeStamp); + + TimeStamp lastModifiedTimeStamp = new TimeStamp(); + lastModifiedTimeStamp.setTime(2000L); + lastModifiedTimeStamp.setActor(userUrn); + input.setLastModified(lastModifiedTimeStamp); + + // Set custom properties + StringMap customProps = new StringMap(); + customProps.put("key1", "value1"); + customProps.put("key2", "value2"); + input.setCustomProperties(customProps); + + // Set hyper parameters + MLHyperParamArray hyperParams = new MLHyperParamArray(); + MLHyperParam hyperParam1 = new MLHyperParam(); + hyperParam1.setName("learning_rate"); + hyperParam1.setValue("0.01"); + hyperParams.add(hyperParam1); + input.setHyperParams(hyperParams); + + // Set training metrics + MLMetricArray trainingMetrics = new MLMetricArray(); + MLMetric metric1 = new MLMetric(); + metric1.setName("accuracy"); + metric1.setValue("0.95"); + trainingMetrics.add(metric1); + input.setTrainingMetrics(trainingMetrics); + + // Set ML features + MLFeatureUrnArray mlFeatures = new MLFeatureUrnArray(); + MLFeatureUrn featureUrn = MLFeatureUrn.createFromString("urn:li:mlFeature:(dataset,feature)"); + mlFeatures.add(featureUrn); + input.setMlFeatures(mlFeatures); + + // Set tags + StringArray tags = new StringArray(); + tags.add("tag1"); + tags.add("tag2"); + input.setTags(tags); + + // Set training and downstream jobs + input.setTrainingJobs( + new com.linkedin.common.UrnArray(Urn.createFromString("urn:li:dataJob:train"))); + input.setDownstreamJobs( + new com.linkedin.common.UrnArray(Urn.createFromString("urn:li:dataJob:predict"))); + + // Create ML Model URN + MLModelUrn modelUrn = + MLModelUrn.createFromString( + 
"urn:li:mlModel:(urn:li:dataPlatform:sagemaker,unittestmodel,PROD)"); + + // Map the properties + com.linkedin.datahub.graphql.generated.MLModelProperties result = + MLModelPropertiesMapper.map(null, input, modelUrn); + + // Verify mapped properties + assertNotNull(result); + assertEquals(result.getName(), "TestModel"); + assertEquals(result.getDescription(), "A test ML model"); + assertEquals(result.getType(), "Classification"); + assertEquals(result.getVersion(), "1.0.0"); + assertEquals(result.getExternalUrl(), "https://example.com/model"); + + // Verify audit stamps + assertNotNull(result.getCreated()); + assertEquals(result.getCreated().getTime().longValue(), 1000L); + assertEquals(result.getCreated().getActor(), userUrn.toString()); + + assertNotNull(result.getLastModified()); + assertEquals(result.getLastModified().getTime().longValue(), 2000L); + assertEquals(result.getLastModified().getActor(), userUrn.toString()); + + // Verify custom properties + assertNotNull(result.getCustomProperties()); + + // Verify hyper parameters + assertNotNull(result.getHyperParams()); + assertEquals(result.getHyperParams().size(), 1); + assertEquals(result.getHyperParams().get(0).getName(), "learning_rate"); + assertEquals(result.getHyperParams().get(0).getValue(), "0.01"); + + // Verify training metrics + assertNotNull(result.getTrainingMetrics()); + assertEquals(result.getTrainingMetrics().size(), 1); + assertEquals(result.getTrainingMetrics().get(0).getName(), "accuracy"); + assertEquals(result.getTrainingMetrics().get(0).getValue(), "0.95"); + + // Verify ML features + assertNotNull(result.getMlFeatures()); + assertEquals(result.getMlFeatures().size(), 1); + assertEquals(result.getMlFeatures().get(0), featureUrn.toString()); + + // Verify tags + assertNotNull(result.getTags()); + assertEquals(result.getTags().get(0), "tag1"); + assertEquals(result.getTags().get(1), "tag2"); + + // Verify lineage info + assertNotNull(result.getMlModelLineageInfo()); + assertEquals(result.getMlModelLineageInfo().getTrainingJobs().size(), 1); + assertEquals(result.getMlModelLineageInfo().getTrainingJobs().get(0), "urn:li:dataJob:train"); + assertEquals(result.getMlModelLineageInfo().getDownstreamJobs().size(), 1); + assertEquals( + result.getMlModelLineageInfo().getDownstreamJobs().get(0), "urn:li:dataJob:predict"); + } + + @Test + public void testMapWithMissingName() throws URISyntaxException { + MLModelProperties input = new MLModelProperties(); + MLModelUrn modelUrn = + MLModelUrn.createFromString( + "urn:li:mlModel:(urn:li:dataPlatform:sagemaker,missingnamemodel,PROD)"); + + com.linkedin.datahub.graphql.generated.MLModelProperties result = + MLModelPropertiesMapper.map(null, input, modelUrn); + + // Verify that name is extracted from URN when not present in input + assertEquals(result.getName(), "missingnamemodel"); + } + + @Test + public void testMapWithMinimalProperties() throws URISyntaxException { + MLModelProperties input = new MLModelProperties(); + MLModelUrn modelUrn = + MLModelUrn.createFromString( + "urn:li:mlModel:(urn:li:dataPlatform:sagemaker,minimalmodel,PROD)"); + + com.linkedin.datahub.graphql.generated.MLModelProperties result = + MLModelPropertiesMapper.map(null, input, modelUrn); + + // Verify basic mapping with minimal properties + assertNotNull(result); + assertEquals(result.getName(), "minimalmodel"); + assertNull(result.getDescription()); + assertNull(result.getType()); + assertNull(result.getVersion()); + } +} diff --git a/datahub-web-react/src/app/entity/EntityPage.tsx 
b/datahub-web-react/src/app/entity/EntityPage.tsx index 916fa41795412..d05f75694ab94 100644 --- a/datahub-web-react/src/app/entity/EntityPage.tsx +++ b/datahub-web-react/src/app/entity/EntityPage.tsx @@ -66,6 +66,7 @@ export const EntityPage = ({ entityType }: Props) => { entityType === EntityType.MlfeatureTable || entityType === EntityType.MlmodelGroup || entityType === EntityType.GlossaryTerm || + entityType === EntityType.DataProcessInstance || entityType === EntityType.GlossaryNode; return ( diff --git a/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx b/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx index 9bb9bd745d1ee..bdf77959e97c7 100644 --- a/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx +++ b/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx @@ -1,12 +1,7 @@ import React from 'react'; import { ApiOutlined } from '@ant-design/icons'; -import { - DataProcessInstance, - Entity as GeneratedEntity, - EntityType, - OwnershipType, - SearchResult, -} from '../../../types.generated'; +import { Entity as GraphQLEntity } from '@types'; +import { DataProcessInstance, EntityType, OwnershipType, SearchResult } from '../../../types.generated'; import { Preview } from './preview/Preview'; import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from '../Entity'; import { EntityProfile } from '../shared/containers/profile/EntityProfile'; @@ -23,32 +18,21 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; -// import SummaryTab from './profile/DataProcessInstaceSummary'; +import SummaryTab from './profile/DataProcessInstanceSummary'; -// const getProcessPlatformName = (data?: DataProcessInstance): string => { -// return ( -// data?.dataPlatformInstance?.platform?.properties?.displayName || -// capitalizeFirstLetterOnly(data?.dataPlatformInstance?.platform?.name) || -// '' -// ); -// }; - -const getParentEntities = (data: DataProcessInstance): GeneratedEntity[] => { +const getParentEntities = (data: DataProcessInstance): GraphQLEntity[] => { const parentEntity = data?.relationships?.relationships?.find( (rel) => rel.type === 'InstanceOf' && rel.entity?.type === EntityType.DataJob, ); - if (!parentEntity?.entity) return []; + if (!parentEntity || !parentEntity.entity) { + return []; + } - // Convert to GeneratedEntity - return [ - { - type: parentEntity.entity.type, - urn: (parentEntity.entity as any).urn, // Make sure urn exists - relationships: (parentEntity.entity as any).relationships, - }, - ]; + // First cast to unknown, then to Entity with proper type + return [parentEntity.entity]; }; + /** * Definition of the DataHub DataProcessInstance entity. 
*/ @@ -97,18 +81,13 @@ export class DataProcessInstanceEntity implements Entity { urn={urn} entityType={EntityType.DataProcessInstance} useEntityQuery={this.useEntityQuery} - // useUpdateQuery={useUpdateDataProcessInstanceMutation} getOverrideProperties={this.getOverridePropertiesFromEntity} headerDropdownItems={new Set([EntityMenuItems.UPDATE_DEPRECATION, EntityMenuItems.RAISE_INCIDENT])} tabs={[ - // { - // name: 'Documentation', - // component: DocumentationTab, - // }, - // { - // name: 'Summary', - // component: SummaryTab, - // }, + { + name: 'Summary', + component: SummaryTab, + }, { name: 'Lineage', component: LineageTab, @@ -117,14 +96,6 @@ export class DataProcessInstanceEntity implements Entity { name: 'Properties', component: PropertiesTab, }, - // { - // name: 'Incidents', - // component: IncidentTab, - // getDynamicName: (_, processInstance) => { - // const activeIncidentCount = processInstance?.dataProcessInstance?.activeIncidents.total; - // return `Incidents${(activeIncidentCount && ` (${activeIncidentCount})`) || ''}`; - // }, - // }, ]} sidebarSections={this.getSidebarSections()} /> @@ -181,13 +152,11 @@ export class DataProcessInstanceEntity implements Entity { platformLogo={data?.dataPlatformInstance?.platform?.properties?.logoUrl} owners={null} globalTags={null} - // domain={data.domain?.domain} dataProduct={getDataProduct(genericProperties?.dataProduct)} externalUrl={data.properties?.externalUrl} parentContainers={data.parentContainers} parentEntities={parentEntities} container={data.container || undefined} - // health={data.health} /> ); }; @@ -196,6 +165,9 @@ export class DataProcessInstanceEntity implements Entity { const data = result.entity as DataProcessInstance; const genericProperties = this.getGenericEntityProperties(data); const parentEntities = getParentEntities(data); + + const firstState = data?.state && data.state.length > 0 ? 
data.state[0] : undefined; + return ( { platformInstanceId={data.dataPlatformInstance?.instanceId} owners={null} globalTags={null} - // domain={data.domain?.domain} dataProduct={getDataProduct(genericProperties?.dataProduct)} - // deprecation={data.deprecation} insights={result.insights} externalUrl={data.properties?.externalUrl} degree={(result as any).degree} @@ -220,10 +190,9 @@ export class DataProcessInstanceEntity implements Entity { parentContainers={data.parentContainers} parentEntities={parentEntities} container={data.container || undefined} - // duration={data?.state?.[0]?.durationMillis} - // status={data?.state?.[0]?.result?.resultType} - // startTime={data?.state?.[0]?.timestampMillis} - // health={data.health} + duration={firstState?.durationMillis} + status={firstState?.result?.resultType} + startTime={firstState?.timestampMillis} /> ); }; @@ -237,7 +206,6 @@ export class DataProcessInstanceEntity implements Entity { icon: entity?.dataPlatformInstance?.platform?.properties?.logoUrl || undefined, platform: entity?.dataPlatformInstance?.platform, container: entity?.container, - // health: entity?.health || undefined, }; }; diff --git a/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx index 3a3b0340695d9..9a2acbe11c084 100644 --- a/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx @@ -39,10 +39,10 @@ export const Preview = ({ health, parentEntities, parentContainers, -}: // duration, -// status, -// startTime, -{ + duration, + status, + startTime, +}: { urn: string; name: string; subType?: string | null; @@ -64,9 +64,9 @@ export const Preview = ({ health?: Health[] | null; parentEntities?: Array | null; parentContainers?: ParentContainersResult | null; - // duration?: number | null; - // status?: string | null; - // startTime?: number | null; + duration?: number | null; + status?: string | null; + startTime?: number | null; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -95,9 +95,9 @@ export const Preview = ({ paths={paths} health={health || undefined} parentEntities={parentEntities} - // duration={duration} - // status={status} - // startTime={startTime} + duration={duration} + status={status} + startTime={startTime} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataProcessInstance/profile/DataProcessInstanceSummary.tsx b/datahub-web-react/src/app/entity/dataProcessInstance/profile/DataProcessInstanceSummary.tsx new file mode 100644 index 0000000000000..c6591d4f5faa1 --- /dev/null +++ b/datahub-web-react/src/app/entity/dataProcessInstance/profile/DataProcessInstanceSummary.tsx @@ -0,0 +1,102 @@ +import React from 'react'; +import styled from 'styled-components'; +import { Space, Table, Typography } from 'antd'; +import { formatDetailedDuration } from '@src/app/shared/time/timeUtils'; +import { capitalize } from 'lodash'; +import moment from 'moment'; +import { MlHyperParam, MlMetric, DataProcessInstanceRunResultType } from '../../../../types.generated'; +import { useBaseEntity } from '../../shared/EntityContext'; +import { InfoItem } from '../../shared/components/styled/InfoItem'; +import { GetDataProcessInstanceQuery } from '../../../../graphql/dataProcessInstance.generated'; +import { Pill } from '../../../../alchemy-components/components/Pills'; + +const TabContent = styled.div` + padding: 16px; +`; + +const InfoItemContainer = 
styled.div<{ justifyContent }>` + display: flex; + position: relative; + justify-content: ${(props) => props.justifyContent}; + padding: 0px 2px; +`; + +const InfoItemContent = styled.div` + padding-top: 8px; + width: 100px; +`; + +const propertyTableColumns = [ + { + title: 'Name', + dataIndex: 'name', + width: 450, + }, + { + title: 'Value', + dataIndex: 'value', + }, +]; + +export default function MLModelSummary() { + const baseEntity = useBaseEntity(); + const dpi = baseEntity?.dataProcessInstance; + + const formatStatus = (state) => { + if (!state || state.length === 0) return '-'; + const result = state[0]?.result?.resultType; + const statusColor = result === DataProcessInstanceRunResultType.Success ? 'green' : 'red'; + return ; + }; + + const formatDuration = (state) => { + if (!state || state.length === 0) return '-'; + return formatDetailedDuration(state[0]?.durationMillis); + }; + + return ( + + + Details + + + + {dpi?.properties?.created?.time + ? moment(dpi.properties.created.time).format('YYYY-MM-DD HH:mm:ss') + : '-'} + + + + {formatStatus(dpi?.state)} + + + {formatDuration(dpi?.state)} + + + {dpi?.mlTrainingRunProperties?.id} + + + {dpi?.properties?.created?.actor} + + + + + {dpi?.mlTrainingRunProperties?.outputUrls} + + + Training Metrics + + Hyper Parameters +
+ + + ); +} diff --git a/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx b/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx index b77f6a19436a5..5e75b4680e427 100644 --- a/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx +++ b/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx @@ -151,7 +151,7 @@ export class MLModelEntity implements Entity { }; displayName = (data: MlModel) => { - return data.name || data.urn; + return data.properties?.name || data.name || data.urn; }; getGenericEntityProperties = (mlModel: MlModel) => { diff --git a/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx index 4b57976dfe1a2..7ea33ba4c15f6 100644 --- a/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx @@ -21,7 +21,8 @@ export const Preview = ({ return ( ` + display: flex; + position: relative; + justify-content: ${(props) => props.justifyContent}; + padding: 0px 2px; +`; + +const InfoItemContent = styled.div` + padding-top: 8px; + width: 100px; + display: flex; + flex-wrap: wrap; + gap: 5px; +`; + +const JobLink = styled(Link)` + color: ${colors.blue[700]}; + &:hover { + text-decoration: underline; + } +`; + export default function MLModelSummary() { const baseEntity = useBaseEntity(); const model = baseEntity?.mlModel; + const entityRegistry = useEntityRegistry(); const propertyTableColumns = [ { @@ -26,9 +55,72 @@ export default function MLModelSummary() { }, ]; + const renderTrainingJobs = () => { + const trainingJobs = + model?.trainedBy?.relationships?.map((relationship) => relationship.entity).filter(notEmpty) || []; + + if (trainingJobs.length === 0) return '-'; + + return ( +
+ {trainingJobs.map((job, index) => { + const { urn, name } = job as { urn: string; name?: string }; + return ( + + + {name || urn} + + {index < trainingJobs.length - 1 && ', '} + + ); + })} +
+ ); + }; + return ( + Model Details + + + {model?.versionProperties?.version?.versionTag} + + + + {model?.properties?.created?.time + ? moment(model.properties.created.time).format('YYYY-MM-DD HH:mm:ss') + : '-'} + + + + + {model?.properties?.lastModified?.time + ? moment(model.properties.lastModified.time).format('YYYY-MM-DD HH:mm:ss') + : '-'} + + + + {model?.properties?.created?.actor} + + + + + + {model?.versionProperties?.aliases?.map((alias) => ( + + ))} + + + + {renderTrainingJobs()} + + Training Metrics
{ }; displayName = (data: MlModelGroup) => { - return data.name || data.urn; + return data.properties?.name || data.name || data.urn; }; getGenericEntityProperties = (mlModelGroup: MlModelGroup) => { diff --git a/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx index 910397af899f5..76ad9c06daece 100644 --- a/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx @@ -19,7 +19,8 @@ export const Preview = ({ return ( ` + display: flex; + position: relative; + justify-content: ${(props) => props.justifyContent}; + padding: 12px 2px 20px 2px; +`; + +const InfoItemContent = styled.div` + padding-top: 8px; + width: 100px; +`; + +const NameContainer = styled.div` + display: flex; + align-items: center; +`; + +const NameLink = styled.a` + font-weight: 700; + color: inherit; + font-size: 0.9rem; + &:hover { + color: ${colors.blue[400]} !important; + } +`; + +const TagContainer = styled.div` + display: inline-flex; + margin-left: 0px; + margin-top: 3px; + flex-wrap: wrap; + margin-right: 8px; + backgroundcolor: white; + gap: 5px; +`; + +const StyledTable = styled(Table)` + &&& .ant-table-cell { + padding: 16px; + } +` as typeof Table; + +const ModelsContainer = styled.div` + width: 100%; + padding: 20px; +`; + +const VersionContainer = styled.div` + display: flex; + align-items: center; +`; export default function MLGroupModels() { const baseEntity = useBaseEntity(); - const models = baseEntity?.mlModelGroup?.incoming?.relationships?.map((relationship) => relationship.entity) || []; - const entityRegistry = useEntityRegistry(); + const modelGroup = baseEntity?.mlModelGroup; + + const models = + baseEntity?.mlModelGroup?.incoming?.relationships + ?.map((relationship) => relationship.entity) + .filter(notEmpty) || []; + + const columns = [ + { + title: 'Name', + dataIndex: 'name', + key: 'name', + width: 300, + render: (_: any, record) => ( + + + {record?.properties?.propertiesName || record?.name} + + + ), + }, + { + title: 'Version', + key: 'version', + width: 70, + render: (_: any, record: any) => ( + {record.versionProperties?.version?.versionTag || '-'} + ), + }, + { + title: 'Created At', + key: 'createdAt', + width: 150, + render: (_: any, record: any) => ( + + {record.properties?.createdTS?.time + ? moment(record.properties.createdTS.time).format('YYYY-MM-DD HH:mm:ss') + : '-'} + + ), + }, + { + title: 'Aliases', + key: 'aliases', + width: 200, + render: (_: any, record: any) => { + const aliases = record.versionProperties?.aliases || []; + + return ( + + {aliases.map((alias) => ( + + ))} + + ); + }, + }, + { + title: 'Tags', + key: 'tags', + width: 200, + render: (_: any, record: any) => { + const tags = record.properties?.tags || []; + + return ( + + {tags.map((tag) => ( + + ))} + + ); + }, + }, + { + title: 'Description', + dataIndex: 'description', + key: 'description', + width: 300, + render: (_: any, record: any) => { + const editableDesc = record.editableProperties?.description; + const originalDesc = record.description; + + return {editableDesc || originalDesc || '-'}; + }, + }, + ]; return ( - <> - - Models} - renderItem={(item) => ( - - {entityRegistry.renderPreview(EntityType.Mlmodel, PreviewType.PREVIEW, item)} - - )} - /> - - + + Model Group Details + + + + {modelGroup?.properties?.created?.time + ? 
moment(modelGroup.properties.created.time).format('YYYY-MM-DD HH:mm:ss') + : '-'} + + + + + {modelGroup?.properties?.lastModified?.time + ? moment(modelGroup.properties.lastModified.time).format('YYYY-MM-DD HH:mm:ss') + : '-'} + + + {modelGroup?.properties?.created?.actor && ( + + {modelGroup.properties.created?.actor} + + )} + + Models + , + }} + /> + ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchSection.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchSection.tsx index 9648aaf852bbe..9da7b5d0ffb0c 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchSection.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchSection.tsx @@ -2,7 +2,7 @@ import React from 'react'; import * as QueryString from 'query-string'; import { useHistory, useLocation } from 'react-router'; import { ApolloError } from '@apollo/client'; -import { FacetFilterInput } from '../../../../../../types.generated'; +import { EntityType, FacetFilterInput } from '../../../../../../types.generated'; import useFilters from '../../../../../search/utils/useFilters'; import { navigateToEntitySearchUrl } from './navigateToEntitySearchUrl'; import { FilterSet, GetSearchResultsParams, SearchResultsInterface } from './types'; @@ -16,6 +16,30 @@ import { } from '../../../../../search/utils/types'; const FILTER = 'filter'; +const SEARCH_ENTITY_TYPES = [ + EntityType.Dataset, + EntityType.Dashboard, + EntityType.Chart, + EntityType.Mlmodel, + EntityType.MlmodelGroup, + EntityType.MlfeatureTable, + EntityType.Mlfeature, + EntityType.MlprimaryKey, + EntityType.DataFlow, + EntityType.DataJob, + EntityType.GlossaryTerm, + EntityType.GlossaryNode, + EntityType.Tag, + EntityType.Role, + EntityType.CorpUser, + EntityType.CorpGroup, + EntityType.Container, + EntityType.Domain, + EntityType.DataProduct, + EntityType.Notebook, + EntityType.BusinessAttribute, + EntityType.DataProcessInstance, +]; function getParamsWithoutFilters(params: QueryString.ParsedQuery) { const paramsCopy = { ...params }; @@ -137,6 +161,7 @@ export const EmbeddedListSearchSection = ({ return ( ; + duration: Maybe; + status: Maybe; +} + +export default function DataProcessInstanceRightColumn({ startTime, duration, status }: Props) { + const statusPillColor = status === DataProcessInstanceRunResultType.Success ? 
'green' : 'red'; + + return ( + <> + {startTime && ( + {toLocalDateTimeString(startTime)}} + title={Start Time} + trigger="hover" + overlayInnerStyle={popoverStyles.overlayInnerStyle} + overlayStyle={popoverStyles.overlayStyle} + > + {toRelativeTimeString(startTime)} + + )} + {duration && ( + {formatDetailedDuration(duration)}} + title={Duration} + trigger="hover" + overlayInnerStyle={popoverStyles.overlayInnerStyle} + overlayStyle={popoverStyles.overlayStyle} + > + {formatDuration(duration)} + + )} + {status && ( + <> + + + + + )} + + ); +} diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index a19862e83ae51..42a32a5a1951f 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -1,8 +1,8 @@ +import DataProcessInstanceRightColumn from '@app/preview/DataProcessInstanceRightColumn'; import React, { ReactNode, useState } from 'react'; import { Divider, Tooltip, Typography } from 'antd'; import { Link } from 'react-router-dom'; import styled from 'styled-components'; - import { GlobalTags, Owner, @@ -200,6 +200,9 @@ interface Props { paths?: EntityPath[]; health?: Health[]; parentDataset?: Dataset; + startTime?: number | null; + duration?: number | null; + status?: string | null; } export default function DefaultPreviewCard({ @@ -243,6 +246,9 @@ export default function DefaultPreviewCard({ paths, health, parentDataset, + startTime, + duration, + status, }: Props) { // sometimes these lists will be rendered inside an entity container (for example, in the case of impact analysis) // in those cases, we may want to enrich the preview w/ context about the container entity @@ -270,7 +276,8 @@ export default function DefaultPreviewCard({ event.stopPropagation(); }; - const shouldShowRightColumn = (topUsers && topUsers.length > 0) || (owners && owners.length > 0); + const shouldShowRightColumn = + (topUsers && topUsers.length > 0) || (owners && owners.length > 0) || startTime || duration || status; const uniqueOwners = getUniqueOwners(owners); return ( @@ -380,6 +387,7 @@ export default function DefaultPreviewCard({ {shouldShowRightColumn && ( + {topUsers && topUsers?.length > 0 && ( <> diff --git a/datahub-web-react/src/app/shared/time/timeUtils.tsx b/datahub-web-react/src/app/shared/time/timeUtils.tsx index 26d768a204be6..4ff6ffedf6533 100644 --- a/datahub-web-react/src/app/shared/time/timeUtils.tsx +++ b/datahub-web-react/src/app/shared/time/timeUtils.tsx @@ -206,3 +206,41 @@ export function getTimeRangeDescription(startDate: moment.Moment | null, endDate return 'Unknown time range'; } + +export function formatDuration(durationMs: number): string { + const duration = moment.duration(durationMs); + const hours = Math.floor(duration.asHours()); + const minutes = duration.minutes(); + const seconds = duration.seconds(); + + if (hours === 0 && minutes === 0) { + return `${seconds} secs`; + } + + if (hours === 0) { + return minutes === 1 ? `${minutes} min` : `${minutes} mins`; + } + + const minuteStr = minutes > 0 ? ` ${minutes} mins` : ''; + return hours === 1 ? `${hours} hr${minuteStr}` : `${hours} hrs${minuteStr}`; +} + +export function formatDetailedDuration(durationMs: number): string { + const duration = moment.duration(durationMs); + const hours = Math.floor(duration.asHours()); + const minutes = duration.minutes(); + const seconds = duration.seconds(); + + const parts: string[] = []; + + if (hours > 0) { + parts.push(hours === 1 ? 
`${hours} hr` : `${hours} hrs`); + } + if (minutes > 0) { + parts.push(minutes === 1 ? `${minutes} min` : `${minutes} mins`); + } + if (seconds > 0) { + parts.push(`${seconds} secs`); + } + return parts.join(' '); +} diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index ecac299748935..e94fc207fefd9 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ -897,6 +897,10 @@ fragment nonRecursiveMLModel on MLModel { key value } + mlModelLineageInfo { + trainingJobs + downstreamJobs + } } globalTags { ...globalTagsFields @@ -971,6 +975,14 @@ fragment nonRecursiveMLModelGroupFields on MLModelGroup { time actor } + lastModified { + time + actor + } + mlModelLineageInfo { + trainingJobs + downstreamJobs + } } browsePathV2 { ...browsePathV2Fields diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql index 457936ed62cd2..f387c0c050668 100644 --- a/datahub-web-react/src/graphql/lineage.graphql +++ b/datahub-web-react/src/graphql/lineage.graphql @@ -272,6 +272,7 @@ fragment lineageNodeProperties on EntityWithRelationships { removed } properties { + propertiesName: name createdTS: created { time actor @@ -296,6 +297,9 @@ fragment lineageNodeProperties on EntityWithRelationships { name description origin + tags { + ...globalTagsFields + } platform { ...platformFields } @@ -305,6 +309,34 @@ fragment lineageNodeProperties on EntityWithRelationships { status { removed } + versionProperties { + versionSet { + urn + type + } + version { + versionTag + } + aliases { + versionTag + } + comment + } + properties { + propertiesName: name + createdTS: created { + time + actor + } + tags + customProperties { + key + value + } + } + editableProperties { + description + } structuredProperties { properties { ...structuredPropertiesFields diff --git a/datahub-web-react/src/graphql/mlModel.graphql b/datahub-web-react/src/graphql/mlModel.graphql index ad97c7c6f530a..ba10a243e6f9b 100644 --- a/datahub-web-react/src/graphql/mlModel.graphql +++ b/datahub-web-react/src/graphql/mlModel.graphql @@ -20,6 +20,23 @@ query getMLModel($urn: String!) { } } } + trainedBy: relationships(input: { types: ["TrainedBy"], direction: OUTGOING, start: 0, count: 100 }) { + start + count + total + relationships { + type + direction + entity { + ... on DataProcessInstance { + urn + name + type + ...dataProcessInstanceFields + } + } + } + } privileges { ...entityPrivileges } diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index d12193b471d46..be72ff31a4f26 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -886,6 +886,9 @@ fragment searchResultsWithoutSchemaField on Entity { ...structuredPropertiesFields } } + properties { + propertiesName: name + } } ... on MLModelGroup { name @@ -908,6 +911,9 @@ fragment searchResultsWithoutSchemaField on Entity { ...structuredPropertiesFields } } + properties { + propertiesName: name + } } ... on Tag { name @@ -954,6 +960,9 @@ fragment searchResultsWithoutSchemaField on Entity { ...versionProperties } } + ... on DataProcessInstance { + ...dataProcessInstanceFields + } ... on DataPlatformInstance { ...dataPlatformInstanceFields }