diff --git a/.github/.codecov.yml b/.github/.codecov.yml
new file mode 100644
index 00000000000000..1faf5a6bab4644
--- /dev/null
+++ b/.github/.codecov.yml
@@ -0,0 +1,65 @@
+comment:
+ layout: "header, files, footer" # remove "new" from "header" and "footer"
+ hide_project_coverage: true # set to false to include project coverage in the PR comment
+ require_changes: false # if true: only post the comment if coverage changes
+
+codecov:
+ # Due to ci-optimization, reports for modules that have not changed may be quite old
+ max_report_age: off
+
+flag_management:
+ default_rules: # the rules that will be followed for any flag added, generally
+ carryforward: true
+ statuses:
+ - type: project
+ target: auto
+ threshold: 0% # Not enforcing project coverage yet.
+ - type: patch
+ target: 90%
+ individual_flags: # exceptions to the default rules above, stated flag by flag
+ - name: frontend
+ paths:
+ - "datahub-frontend/**"
+ - "datahub-web-react/**"
+ - name: backend
+ paths:
+ - "metadata-models/**"
+ - "datahub-upgrade/**"
+ - "entity-registry/**"
+ - "li-utils/**"
+ - "metadata-auth/**"
+ - "metadata-dao-impl/**"
+ - "metadata-events/**"
+ - "metadata-jobs/**"
+ - "metadata-service/**"
+ - "metadata-utils/**"
+ - "metadata-operation-context/**"
+ - "datahub-graphql-core/**"
+ - name: metadata-io
+ paths:
+ - "metadata-io/**"
+ - name: ingestion
+ paths:
+ - "metadata-ingestion/**"
+ - name: ingestion-airflow
+ paths:
+ - "metadata-ingestion-modules/airflow-plugin/**"
+ - name: ingestion-dagster
+ paths:
+ - "metadata-ingestion-modules/dagster-plugin/**"
+ - name: ingestion-gx-plugin
+ paths:
+ - "metadata-ingestion-modules/gx-plugin/**"
+ - name: ingestion-prefect
+ paths:
+ - "metadata-ingestion-modules/prefect-plugin/**"
+coverage:
+ status:
+ project:
+ default:
+ target: 0% # no threshold enforcement yet
+ only_pulls: true
+ patch:
+ default:
+ target: 90% # for new code added in the patch
+ only_pulls: true
diff --git a/.github/actions/ci-optimization/action.yml b/.github/actions/ci-optimization/action.yml
index 0d435963382675..8a81859ae903a8 100644
--- a/.github/actions/ci-optimization/action.yml
+++ b/.github/actions/ci-optimization/action.yml
@@ -13,16 +13,16 @@ outputs:
value: ${{ steps.filter.outputs.frontend == 'false' && steps.filter.outputs.ingestion == 'false' && steps.filter.outputs.backend == 'true' }}
backend-change:
description: "Backend code has changed"
- value: ${{ steps.filter.outputs.backend == 'true' }}
+ value: ${{ steps.filter.outputs.backend == 'true' || steps.trigger.outputs.trigger == 'manual' }}
ingestion-change:
description: "Ingestion code has changed"
- value: ${{ steps.filter.outputs.ingestion == 'true' }}
+ value: ${{ steps.filter.outputs.ingestion == 'true' || steps.trigger.outputs.trigger == 'manual' }}
ingestion-base-change:
description: "Ingestion base image docker image has changed"
value: ${{ steps.filter.outputs.ingestion-base == 'true' }}
frontend-change:
description: "Frontend code has changed"
- value: ${{ steps.filter.outputs.frontend == 'true' }}
+ value: ${{ steps.filter.outputs.frontend == 'true' || steps.trigger.outputs.trigger == 'manual' }}
docker-change:
description: "Docker code has changed"
value: ${{ steps.filter.outputs.docker == 'true' }}
@@ -44,6 +44,15 @@ outputs:
runs:
using: "composite"
steps:
+ - name: Check trigger type
+ id: trigger # referenced by the *-change outputs above
+ shell: bash
+ run: |
+ if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
+ echo "trigger=manual" >> $GITHUB_OUTPUT
+ else
+ echo "trigger=pr" >> $GITHUB_OUTPUT
+ fi
- uses: dorny/paths-filter@v3
id: filter
with:
diff --git a/.github/scripts/generate_pre_commit.py b/.github/scripts/generate_pre_commit.py
new file mode 100755
index 00000000000000..2db73fd357ff5f
--- /dev/null
+++ b/.github/scripts/generate_pre_commit.py
@@ -0,0 +1,279 @@
+"""Generate pre-commit hooks for Java and Python projects.
+
+This script scans a repository for Java and Python projects and generates appropriate
+pre-commit hooks for linting and formatting. It also merges in additional hooks from
+an override file.
+"""
+
+import datetime
+import os
+from dataclasses import dataclass
+from enum import Enum, auto
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+
+class ProjectType(Enum):
+ """Types of projects supported for hook generation."""
+
+ JAVA = auto()
+ PYTHON = auto()
+
+
+@dataclass
+class Project:
+ """Represents a project found in the repository."""
+
+ path: str
+ type: ProjectType
+
+ @property
+ def gradle_path(self) -> str:
+ """Convert path to Gradle task format."""
+ return ":" + self.path.replace("/", ":")
+
+ @property
+ def project_id(self) -> str:
+ """Generate a unique identifier for the project."""
+ return self.path.replace("/", "-").replace(".", "-")
+
+
+class ProjectFinder:
+ """Find Java and Python projects in a repository."""
+
+ JAVA_PATTERNS = [
+ "plugins.hasPlugin('java')",
+ "apply plugin: 'java'",
+ "id 'java'",
+ "id 'java-library'",
+ "plugins.hasPlugin('java-library')",
+ "apply plugin: 'java-library'",
+ "plugins.hasPlugin('pegasus')",
+ "org.springframework.boot",
+ ]
+
+ EXCLUDED_DIRS = {".git", "build", "node_modules", ".tox", "venv"}
+ SOURCE_EXTENSIONS = {".java", ".kt", ".groovy"}
+
+ def __init__(self, root_dir: str):
+ self.root_path = Path(root_dir)
+
+ def find_all_projects(self) -> list[Project]:
+ """Find all Java and Python projects in the repository."""
+ java_projects = self._find_java_projects()
+ python_projects = self._find_python_projects()
+
+ all_projects = []
+ all_projects.extend(
+ Project(path=p, type=ProjectType.JAVA) for p in java_projects
+ )
+ all_projects.extend(
+ Project(path=p, type=ProjectType.PYTHON) for p in python_projects
+ )
+
+ return sorted(all_projects, key=lambda p: p.path)
+
+ def _find_java_projects(self) -> set[str]:
+ """Find all Java projects by checking build.gradle files."""
+ java_projects = set()
+
+ # Search both build.gradle and build.gradle.kts
+ for pattern in ["build.gradle", "build.gradle.kts"]:
+ for gradle_file in self.root_path.rglob(pattern):
+ if self._should_skip_directory(gradle_file.parent):
+ continue
+
+ if self._is_java_project(gradle_file):
+ java_projects.add(self._get_relative_path(gradle_file.parent))
+
+ return {
+ p
+ for p in java_projects
+ if "buildSrc" not in p and "spark-smoke-test" not in p and p != "."
+ }
+
+ def _find_python_projects(self) -> set[str]:
+ """Find all Python projects by checking for setup.py or pyproject.toml."""
+ python_projects = set()
+
+ for file_name in ["setup.py", "pyproject.toml"]:
+ for path in self.root_path.rglob(file_name):
+ if self._should_skip_directory(path.parent):
+ continue
+
+ rel_path = self._get_relative_path(path.parent)
+ if "examples" not in rel_path:
+ python_projects.add(rel_path)
+
+ return python_projects
+
+ def _should_skip_directory(self, path: Path) -> bool:
+ """Check if directory should be skipped."""
+ return any(
+ part in self.EXCLUDED_DIRS or part.startswith(".") for part in path.parts
+ )
+
+ def _is_java_project(self, gradle_file: Path) -> bool:
+ """Check if a Gradle file represents a Java project."""
+ try:
+ content = gradle_file.read_text()
+ has_java_plugin = any(pattern in content for pattern in self.JAVA_PATTERNS)
+
+ if has_java_plugin:
+ # Verify presence of source files
+ return any(
+ list(gradle_file.parent.rglob(f"*{ext}"))
+ for ext in self.SOURCE_EXTENSIONS
+ )
+ return False
+
+ except Exception as e:
+ print(f"Warning: Error reading {gradle_file}: {e}")
+ return False
+
+ def _get_relative_path(self, path: Path) -> str:
+ """Get relative path from root, normalized with forward slashes."""
+ return str(path.relative_to(self.root_path)).replace("\\", "/")
+
+
+class HookGenerator:
+ """Generate pre-commit hooks for projects."""
+
+ def __init__(self, projects: list[Project], override_file: Optional[str] = None):
+ self.projects = projects
+ self.override_file = override_file
+
+ def generate_config(self) -> dict:
+ """Generate the complete pre-commit config."""
+ hooks = []
+
+ for project in self.projects:
+ if project.type == ProjectType.PYTHON:
+ hooks.append(self._generate_lint_fix_hook(project))
+ else: # ProjectType.JAVA
+ hooks.append(self._generate_spotless_hook(project))
+
+ config = {"repos": [{"repo": "local", "hooks": hooks}]}
+
+ # Merge override hooks if they exist
+ if self.override_file and os.path.exists(self.override_file):
+ try:
+ with open(self.override_file, 'r') as f:
+ override_config = yaml.safe_load(f)
+
+ if override_config and 'repos' in override_config:
+ for override_repo in override_config['repos']:
+ matching_repo = next(
+ (repo for repo in config['repos']
+ if repo['repo'] == override_repo['repo']),
+ None
+ )
+
+ if matching_repo:
+ matching_repo['hooks'].extend(override_repo.get('hooks', []))
+ else:
+ config['repos'].append(override_repo)
+
+ print(f"Merged additional hooks from {self.override_file}")
+ except Exception as e:
+ print(f"Warning: Error reading override file {self.override_file}: {e}")
+
+ return config
+
+ def _generate_lint_fix_hook(self, project: Project) -> dict:
+ """Generate a lint-fix hook for Python projects."""
+ return {
+ "id": f"{project.project_id}-lint-fix",
+ "name": f"{project.path} Lint Fix",
+ "entry": f"./gradlew {project.gradle_path}:lintFix",
+ "language": "system",
+ "files": f"^{project.path}/.*\\.py$",
+ "pass_filenames": False,
+ }
+
+ def _generate_spotless_hook(self, project: Project) -> dict:
+ """Generate a spotless hook for Java projects."""
+ return {
+ "id": f"{project.project_id}-spotless",
+ "name": f"{project.path} Spotless Apply",
+ "entry": f"./gradlew {project.gradle_path}:spotlessApply",
+ "language": "system",
+ "files": f"^{project.path}/.*\\.java$",
+ "pass_filenames": False,
+ }
+
+
+class PrecommitDumper(yaml.Dumper):
+ """Custom YAML dumper that maintains proper indentation."""
+
+ def increase_indent(self, flow=False, *args, **kwargs):
+ return super().increase_indent(flow=flow, indentless=False)
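+
+ # Note: PyYAML's default Dumper writes block-sequence items flush with their
+ # parent key; overriding increase_indent with indentless=False indents them,
+ # matching the nested "hooks:" layout of the generated .pre-commit-config.yaml.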
+
+
+def write_yaml_with_spaces(file_path: str, data: dict):
+ """Write YAML file with extra spacing between hooks and a timestamp header."""
+ with open(file_path, "w") as f:
+ # Add timestamp header
+ current_time = datetime.datetime.now(datetime.timezone.utc)
+ formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S %Z")
+ header = f"# Auto-generated by .github/scripts/generate_pre_commit.py at {formatted_time}\n"
+ f.write(header)
+ header = f"# Do not edit this file directly. Run the script to regenerate.\n"
+ f.write(header)
+ header = f"# Add additional hooks in .github/scripts/pre-commit-override.yaml\n"
+ f.write(header)
+
+ # Write the YAML content
+ yaml_str = yaml.dump(
+ data, Dumper=PrecommitDumper, sort_keys=False, default_flow_style=False
+ )
+
+ # Add extra newline between hooks
+ lines = yaml_str.split("\n")
+ result = []
+ in_hook = False
+
+ for line in lines:
+ if line.strip().startswith("- id:"):
+ if in_hook: # If we were already in a hook, add extra newline
+ result.append("")
+ in_hook = True
+ elif not line.strip() and in_hook:
+ in_hook = False
+
+ result.append(line)
+
+ f.write("\n".join(result))
+
+
+def main():
+ root_dir = os.path.abspath(os.curdir)
+ override_file = ".github/scripts/pre-commit-override.yaml"
+
+ # Find projects
+ finder = ProjectFinder(root_dir)
+ projects = finder.find_all_projects()
+
+ # Print summary
+ print("Found projects:")
+ print("\nJava projects:")
+ for project in projects:
+ if project.type == ProjectType.JAVA:
+ print(f" - {project.path}")
+
+ print("\nPython projects:")
+ for project in projects:
+ if project.type == ProjectType.PYTHON:
+ print(f" - {project.path}")
+
+ # Generate and write config
+ generator = HookGenerator(projects, override_file)
+ config = generator.generate_config()
+ write_yaml_with_spaces(".pre-commit-config.yaml", config)
+
+ print("\nGenerated .pre-commit-config.yaml")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/.github/scripts/pre-commit-override.yaml b/.github/scripts/pre-commit-override.yaml
new file mode 100644
index 00000000000000..961134bebe2c98
--- /dev/null
+++ b/.github/scripts/pre-commit-override.yaml
@@ -0,0 +1,9 @@
+repos:
+ - repo: local
+ hooks:
+ - id: smoke-test-cypress-lint-fix
+ name: smoke-test cypress Lint Fix
+ entry: ./gradlew :smoke-test:cypressLintFix
+ language: system
+ files: ^smoke-test/tests/cypress/.*$
+ pass_filenames: false
\ No newline at end of file
diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index b824a21be63f8f..e1e0fb0a85e977 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -18,6 +18,7 @@ on:
- "metadata-models/**"
release:
types: [published]
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -77,16 +78,21 @@ jobs:
**/build/test-results/test/**
**/junit.*.xml
!**/binary/**
- - name: Upload coverage to Codecov
+ - name: Upload coverage to Codecov with ingestion flag
if: always()
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ directory: ./build/coverage-reports/metadata-ingestion-modules/airflow-plugin/
fail_ci_if_error: false
- flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }}
- name: pytest-airflow
+ flags: ingestion-airflow
+ name: pytest-airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_requirements }}
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
event-file:
runs-on: ubuntu-latest
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 784dce0f11b2b5..86545946d6afea 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -12,6 +12,7 @@ on:
paths-ignore:
- "docs/**"
- "**.md"
+ workflow_dispatch:
release:
types: [published]
@@ -113,10 +114,16 @@ jobs:
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
run: |
./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
- - uses: actions/upload-artifact@v3
+ - name: Gather coverage files
+ run: |
+ echo "BACKEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(metadata-models|entity-registry|datahuyb-graphql-core|metadata-io|metadata-jobs|metadata-utils|metadata-service|medata-dao-impl|metadata-operation|li-utils|metadata-integration|metadata-events|metadata-auth|ingestion-scheduler|notifications|datahub-upgrade)' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
+ echo "FRONTEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(datahub-frontend|datahub-web-react).*\.(xml|json)$' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
+ - name: Generate tz artifact name
+ run: echo "NAME_TZ=$(echo ${{ matrix.timezone }} | tr '/' '-')" >> $GITHUB_ENV
+ - uses: actions/upload-artifact@v4
if: always()
with:
- name: Test Results (build)
+ name: Test Results (build) - ${{ matrix.command }}-${{ env.NAME_TZ }}
path: |
**/build/reports/tests/test/**
**/build/test-results/test/**
@@ -124,16 +131,35 @@ jobs:
!**/binary/**
- name: Ensure codegen is updated
uses: ./.github/actions/ensure-codegen-updated
- - name: Upload coverage to Codecov
- if: always()
+ - name: Upload backend coverage to Codecov
+ if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
+ uses: codecov/codecov-action@v5
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
+ files: ${{ env.BACKEND_FILES }}
+ disable_search: true
+ #handle_no_reports_found: true
+ fail_ci_if_error: false
+ flags: backend
+ name: ${{ matrix.command }}
+ verbose: true
+ - name: Upload frontend coverage to Codecov
+ if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ files: ${{ env.FRONTEND_FILES }}
+ disable_search: true
+ #handle_no_reports_found: true
fail_ci_if_error: false
- flags: ${{ matrix.timezone }}
+ flags: frontend
name: ${{ matrix.command }}
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
quickstart-compose-validation:
runs-on: ubuntu-latest
@@ -152,7 +178,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: Event File
path: ${{ github.event_path }}
diff --git a/.github/workflows/close-stale-issues.yml b/.github/workflows/close-stale-issues.yml
index 98e3041f288040..005f41b767ea6d 100644
--- a/.github/workflows/close-stale-issues.yml
+++ b/.github/workflows/close-stale-issues.yml
@@ -10,7 +10,7 @@ jobs:
issues: write
pull-requests: write
steps:
- - uses: actions/stale@v6
+ - uses: actions/stale@v9
with:
ascending: true
operations-per-run: 100
diff --git a/.github/workflows/contributor-open-pr-comment.yml b/.github/workflows/contributor-open-pr-comment.yml
index decc7ab27a411d..fe60601b0159bd 100644
--- a/.github/workflows/contributor-open-pr-comment.yml
+++ b/.github/workflows/contributor-open-pr-comment.yml
@@ -17,12 +17,12 @@ jobs:
- name: Get and Format Username (PR only)
if: github.event_name == 'pull_request'
run: |
- formatted_username=$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')
- echo "FORMATTED_USERNAME=$formatted_username" >> $GITHUB_ENV
+ formatted_username="$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')"
+ echo "FORMATTED_USERNAME=${formatted_username}" >> "$GITHUB_ENV"
- name: Create Comment (PR only)
if: github.event_name == 'pull_request'
- uses: actions/github-script@v6
+ uses: actions/github-script@v7
with:
script: |
if (context.payload.pull_request) {
diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml
index ae9a0b1605cdf3..a2ac59d6989a9f 100644
--- a/.github/workflows/dagster-plugin.yml
+++ b/.github/workflows/dagster-plugin.yml
@@ -18,6 +18,7 @@ on:
- "metadata-models/**"
release:
types: [published]
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -64,16 +65,21 @@ jobs:
**/build/reports/tests/test/**
**/build/test-results/test/**
**/junit.*.xml
- - name: Upload coverage to Codecov
+ - name: Upload coverage to Codecov with ingestion flag
if: always()
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ directory: ./build/coverage-reports/metadata-ingestion-modules/dagster-plugin/
fail_ci_if_error: false
- flags: dagster-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }}
+ flags: ingestion-dagster
name: pytest-dagster
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
event-file:
runs-on: ubuntu-latest
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index a5200c7e917d81..e44e6b11c6d057 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -1253,19 +1253,19 @@ jobs:
TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
source .github/scripts/docker_logs.sh
- name: Upload logs
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: failure()
with:
name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: "docker_logs/*.log"
retention-days: 5
- name: Upload screenshots
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: failure()
with:
name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: smoke-test/tests/cypress/cypress/screenshots/
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}
diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml
index 2fd814a0764858..c28bdbb30eb36d 100644
--- a/.github/workflows/gx-plugin.yml
+++ b/.github/workflows/gx-plugin.yml
@@ -18,6 +18,7 @@ on:
- "metadata-models/**"
release:
types: [published]
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -68,16 +69,21 @@ jobs:
**/build/reports/tests/test/**
**/build/test-results/test/**
**/junit.*.xml
- - name: Upload coverage to Codecov
+ - name: Upload coverage to Codecov with ingestion flag
if: always()
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ directory: ./build/coverage-reports/metadata-ingestion-modules/gx-plugin/
fail_ci_if_error: false
- flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }}
+ flags: ingestion-gx-plugin
name: pytest-gx
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
event-file:
runs-on: ubuntu-latest
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
index f4d87b361b5edc..be6026098ce420 100644
--- a/.github/workflows/metadata-ingestion.yml
+++ b/.github/workflows/metadata-ingestion.yml
@@ -18,6 +18,7 @@ on:
- "metadata-models/**"
release:
types: [published]
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -88,16 +89,21 @@ jobs:
**/build/test-results/test/**
**/junit.*.xml
!**/binary/**
- - name: Upload coverage to Codecov
- if: ${{ always() }}
+ - name: Upload coverage to Codecov with ingestion flag
+ if: ${{ always() && matrix.python-version == '3.11' }}
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ directory: ./build/coverage-reports/metadata-ingestion/
fail_ci_if_error: false
- flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }}
- name: pytest-ingestion
+ flags: ingestion
+ name: pytest-${{ matrix.python-version }}-${{ matrix.command }}
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
event-file:
runs-on: ubuntu-latest
diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml
index 2225baecde64c6..80af03e77eef82 100644
--- a/.github/workflows/metadata-io.yml
+++ b/.github/workflows/metadata-io.yml
@@ -20,6 +20,7 @@ on:
- ".github/workflows/metadata-io.yml"
release:
types: [published]
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -70,7 +71,7 @@ jobs:
- name: Gradle build (and test)
run: |
./gradlew :metadata-io:test
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (metadata-io)
@@ -86,16 +87,22 @@ jobs:
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ directory: ./build/coverage-reports/metadata-io/
fail_ci_if_error: false
+ flags: metadata-io
name: metadata-io-test
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
event-file:
runs-on: ubuntu-latest
steps:
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: Event File
path: ${{ github.event_path }}
diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml
index 879df032409f28..401efa340ae8ca 100644
--- a/.github/workflows/prefect-plugin.yml
+++ b/.github/workflows/prefect-plugin.yml
@@ -18,6 +18,7 @@ on:
- "metadata-models/**"
release:
types: [published]
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -60,16 +61,21 @@ jobs:
**/build/test-results/test/**
**/junit.*.xml
!**/binary/**
- - name: Upload coverage to Codecov
+ - name: Upload coverage to Codecov with ingestion flag
if: always()
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
- directory: ./build/coverage-reports/
+ directory: ./build/coverage-reports/metadata-ingestion-modules/prefect-plugin/
fail_ci_if_error: false
- flags: prefect-${{ matrix.python-version }}
- name: pytest-prefect
+ flags: ingestion-prefect
+ name: pytest-prefect-${{ matrix.python-version }}
verbose: true
+ - name: Upload test results to Codecov
+ if: ${{ !cancelled() }}
+ uses: codecov/test-results-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
event-file:
runs-on: ubuntu-latest
diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml
index 23413336404f2b..e6a6705a72879c 100644
--- a/.github/workflows/spark-smoke-test.yml
+++ b/.github/workflows/spark-smoke-test.yml
@@ -72,14 +72,14 @@ jobs:
docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
- name: Upload logs
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: failure()
with:
name: docker logs
path: |
"**/build/container-logs/*.log"
"*.log"
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 898e3d262b3941..3697efa37770e7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,26 +1,445 @@
-exclude: ^$
-files: ^(docs/|docs-website/|metadata-ingestion/)
+# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-01-09 10:08:09 UTC
+# Do not edit this file directly. Run the script to regenerate.
+# Add additional hooks in .github/scripts/pre-commit-override.yaml
repos:
- - repo: https://github.com/pre-commit/mirrors-isort
- rev: v5.10.1
+ - repo: local
hooks:
- - id: isort
- - repo: https://github.com/ambv/black
- rev: 23.1.0
- hooks:
- - id: black
- - repo: https://github.com/myint/autoflake
- rev: v1.4
- hooks:
- - id: autoflake
- args:
- - --in-place
- - --remove-unused-variables
- - --remove-all-unused-imports
- - --expand-star-imports
- - repo: https://github.com/pre-commit/mirrors-prettier
- rev: "v3.0.0-alpha.6" # Use the sha or tag you want to point at
- hooks:
- - id: prettier
- args:
- - --write
\ No newline at end of file
+ - id: datahub-graphql-core-spotless
+ name: datahub-graphql-core Spotless Apply
+ entry: ./gradlew :datahub-graphql-core:spotlessApply
+ language: system
+ files: ^datahub-graphql-core/.*\.java$
+ pass_filenames: false
+
+ - id: datahub-upgrade-spotless
+ name: datahub-upgrade Spotless Apply
+ entry: ./gradlew :datahub-upgrade:spotlessApply
+ language: system
+ files: ^datahub-upgrade/.*\.java$
+ pass_filenames: false
+
+ - id: entity-registry-spotless
+ name: entity-registry Spotless Apply
+ entry: ./gradlew :entity-registry:spotlessApply
+ language: system
+ files: ^entity-registry/.*\.java$
+ pass_filenames: false
+
+ - id: ingestion-scheduler-spotless
+ name: ingestion-scheduler Spotless Apply
+ entry: ./gradlew :ingestion-scheduler:spotlessApply
+ language: system
+ files: ^ingestion-scheduler/.*\.java$
+ pass_filenames: false
+
+ - id: li-utils-spotless
+ name: li-utils Spotless Apply
+ entry: ./gradlew :li-utils:spotlessApply
+ language: system
+ files: ^li-utils/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-auth-auth-api-spotless
+ name: metadata-auth/auth-api Spotless Apply
+ entry: ./gradlew :metadata-auth:auth-api:spotlessApply
+ language: system
+ files: ^metadata-auth/auth-api/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-dao-impl-kafka-producer-spotless
+ name: metadata-dao-impl/kafka-producer Spotless Apply
+ entry: ./gradlew :metadata-dao-impl:kafka-producer:spotlessApply
+ language: system
+ files: ^metadata-dao-impl/kafka-producer/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-events-mxe-avro-spotless
+ name: metadata-events/mxe-avro Spotless Apply
+ entry: ./gradlew :metadata-events:mxe-avro:spotlessApply
+ language: system
+ files: ^metadata-events/mxe-avro/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-events-mxe-registration-spotless
+ name: metadata-events/mxe-registration Spotless Apply
+ entry: ./gradlew :metadata-events:mxe-registration:spotlessApply
+ language: system
+ files: ^metadata-events/mxe-registration/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-events-mxe-schemas-spotless
+ name: metadata-events/mxe-schemas Spotless Apply
+ entry: ./gradlew :metadata-events:mxe-schemas:spotlessApply
+ language: system
+ files: ^metadata-events/mxe-schemas/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-events-mxe-utils-avro-spotless
+ name: metadata-events/mxe-utils-avro Spotless Apply
+ entry: ./gradlew :metadata-events:mxe-utils-avro:spotlessApply
+ language: system
+ files: ^metadata-events/mxe-utils-avro/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-ingestion-lint-fix
+ name: metadata-ingestion Lint Fix
+ entry: ./gradlew :metadata-ingestion:lintFix
+ language: system
+ files: ^metadata-ingestion/.*\.py$
+ pass_filenames: false
+
+ - id: metadata-ingestion-modules-airflow-plugin-lint-fix
+ name: metadata-ingestion-modules/airflow-plugin Lint Fix
+ entry: ./gradlew :metadata-ingestion-modules:airflow-plugin:lintFix
+ language: system
+ files: ^metadata-ingestion-modules/airflow-plugin/.*\.py$
+ pass_filenames: false
+
+ - id: metadata-ingestion-modules-dagster-plugin-lint-fix
+ name: metadata-ingestion-modules/dagster-plugin Lint Fix
+ entry: ./gradlew :metadata-ingestion-modules:dagster-plugin:lintFix
+ language: system
+ files: ^metadata-ingestion-modules/dagster-plugin/.*\.py$
+ pass_filenames: false
+
+ - id: metadata-ingestion-modules-gx-plugin-lint-fix
+ name: metadata-ingestion-modules/gx-plugin Lint Fix
+ entry: ./gradlew :metadata-ingestion-modules:gx-plugin:lintFix
+ language: system
+ files: ^metadata-ingestion-modules/gx-plugin/.*\.py$
+ pass_filenames: false
+
+ - id: metadata-ingestion-modules-prefect-plugin-lint-fix
+ name: metadata-ingestion-modules/prefect-plugin Lint Fix
+ entry: ./gradlew :metadata-ingestion-modules:prefect-plugin:lintFix
+ language: system
+ files: ^metadata-ingestion-modules/prefect-plugin/.*\.py$
+ pass_filenames: false
+
+ - id: metadata-integration-java-acryl-spark-lineage-spotless
+ name: metadata-integration/java/acryl-spark-lineage Spotless Apply
+ entry: ./gradlew :metadata-integration:java:acryl-spark-lineage:spotlessApply
+ language: system
+ files: ^metadata-integration/java/acryl-spark-lineage/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-datahub-client-spotless
+ name: metadata-integration/java/datahub-client Spotless Apply
+ entry: ./gradlew :metadata-integration:java:datahub-client:spotlessApply
+ language: system
+ files: ^metadata-integration/java/datahub-client/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-datahub-event-spotless
+ name: metadata-integration/java/datahub-event Spotless Apply
+ entry: ./gradlew :metadata-integration:java:datahub-event:spotlessApply
+ language: system
+ files: ^metadata-integration/java/datahub-event/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-datahub-protobuf-spotless
+ name: metadata-integration/java/datahub-protobuf Spotless Apply
+ entry: ./gradlew :metadata-integration:java:datahub-protobuf:spotlessApply
+ language: system
+ files: ^metadata-integration/java/datahub-protobuf/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-datahub-schematron-cli-spotless
+ name: metadata-integration/java/datahub-schematron/cli Spotless Apply
+ entry: ./gradlew :metadata-integration:java:datahub-schematron:cli:spotlessApply
+ language: system
+ files: ^metadata-integration/java/datahub-schematron/cli/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-datahub-schematron-lib-spotless
+ name: metadata-integration/java/datahub-schematron/lib Spotless Apply
+ entry: ./gradlew :metadata-integration:java:datahub-schematron:lib:spotlessApply
+ language: system
+ files: ^metadata-integration/java/datahub-schematron/lib/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-examples-spotless
+ name: metadata-integration/java/examples Spotless Apply
+ entry: ./gradlew :metadata-integration:java:examples:spotlessApply
+ language: system
+ files: ^metadata-integration/java/examples/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-openlineage-converter-spotless
+ name: metadata-integration/java/openlineage-converter Spotless Apply
+ entry: ./gradlew :metadata-integration:java:openlineage-converter:spotlessApply
+ language: system
+ files: ^metadata-integration/java/openlineage-converter/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-integration-java-spark-lineage-legacy-spotless
+ name: metadata-integration/java/spark-lineage-legacy Spotless Apply
+ entry: ./gradlew :metadata-integration:java:spark-lineage-legacy:spotlessApply
+ language: system
+ files: ^metadata-integration/java/spark-lineage-legacy/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-io-spotless
+ name: metadata-io Spotless Apply
+ entry: ./gradlew :metadata-io:spotlessApply
+ language: system
+ files: ^metadata-io/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-io-metadata-io-api-spotless
+ name: metadata-io/metadata-io-api Spotless Apply
+ entry: ./gradlew :metadata-io:metadata-io-api:spotlessApply
+ language: system
+ files: ^metadata-io/metadata-io-api/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-jobs-common-spotless
+ name: metadata-jobs/common Spotless Apply
+ entry: ./gradlew :metadata-jobs:common:spotlessApply
+ language: system
+ files: ^metadata-jobs/common/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-jobs-mae-consumer-spotless
+ name: metadata-jobs/mae-consumer Spotless Apply
+ entry: ./gradlew :metadata-jobs:mae-consumer:spotlessApply
+ language: system
+ files: ^metadata-jobs/mae-consumer/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-jobs-mae-consumer-job-spotless
+ name: metadata-jobs/mae-consumer-job Spotless Apply
+ entry: ./gradlew :metadata-jobs:mae-consumer-job:spotlessApply
+ language: system
+ files: ^metadata-jobs/mae-consumer-job/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-jobs-mce-consumer-spotless
+ name: metadata-jobs/mce-consumer Spotless Apply
+ entry: ./gradlew :metadata-jobs:mce-consumer:spotlessApply
+ language: system
+ files: ^metadata-jobs/mce-consumer/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-jobs-mce-consumer-job-spotless
+ name: metadata-jobs/mce-consumer-job Spotless Apply
+ entry: ./gradlew :metadata-jobs:mce-consumer-job:spotlessApply
+ language: system
+ files: ^metadata-jobs/mce-consumer-job/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-jobs-pe-consumer-spotless
+ name: metadata-jobs/pe-consumer Spotless Apply
+ entry: ./gradlew :metadata-jobs:pe-consumer:spotlessApply
+ language: system
+ files: ^metadata-jobs/pe-consumer/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-models-spotless
+ name: metadata-models Spotless Apply
+ entry: ./gradlew :metadata-models:spotlessApply
+ language: system
+ files: ^metadata-models/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-models-custom-spotless
+ name: metadata-models-custom Spotless Apply
+ entry: ./gradlew :metadata-models-custom:spotlessApply
+ language: system
+ files: ^metadata-models-custom/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-models-validator-spotless
+ name: metadata-models-validator Spotless Apply
+ entry: ./gradlew :metadata-models-validator:spotlessApply
+ language: system
+ files: ^metadata-models-validator/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-operation-context-spotless
+ name: metadata-operation-context Spotless Apply
+ entry: ./gradlew :metadata-operation-context:spotlessApply
+ language: system
+ files: ^metadata-operation-context/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-auth-config-spotless
+ name: metadata-service/auth-config Spotless Apply
+ entry: ./gradlew :metadata-service:auth-config:spotlessApply
+ language: system
+ files: ^metadata-service/auth-config/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-auth-filter-spotless
+ name: metadata-service/auth-filter Spotless Apply
+ entry: ./gradlew :metadata-service:auth-filter:spotlessApply
+ language: system
+ files: ^metadata-service/auth-filter/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-auth-impl-spotless
+ name: metadata-service/auth-impl Spotless Apply
+ entry: ./gradlew :metadata-service:auth-impl:spotlessApply
+ language: system
+ files: ^metadata-service/auth-impl/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-auth-servlet-impl-spotless
+ name: metadata-service/auth-servlet-impl Spotless Apply
+ entry: ./gradlew :metadata-service:auth-servlet-impl:spotlessApply
+ language: system
+ files: ^metadata-service/auth-servlet-impl/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-configuration-spotless
+ name: metadata-service/configuration Spotless Apply
+ entry: ./gradlew :metadata-service:configuration:spotlessApply
+ language: system
+ files: ^metadata-service/configuration/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-factories-spotless
+ name: metadata-service/factories Spotless Apply
+ entry: ./gradlew :metadata-service:factories:spotlessApply
+ language: system
+ files: ^metadata-service/factories/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-graphql-servlet-impl-spotless
+ name: metadata-service/graphql-servlet-impl Spotless Apply
+ entry: ./gradlew :metadata-service:graphql-servlet-impl:spotlessApply
+ language: system
+ files: ^metadata-service/graphql-servlet-impl/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-openapi-analytics-servlet-spotless
+ name: metadata-service/openapi-analytics-servlet Spotless Apply
+ entry: ./gradlew :metadata-service:openapi-analytics-servlet:spotlessApply
+ language: system
+ files: ^metadata-service/openapi-analytics-servlet/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-openapi-entity-servlet-spotless
+ name: metadata-service/openapi-entity-servlet Spotless Apply
+ entry: ./gradlew :metadata-service:openapi-entity-servlet:spotlessApply
+ language: system
+ files: ^metadata-service/openapi-entity-servlet/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-openapi-entity-servlet-generators-spotless
+ name: metadata-service/openapi-entity-servlet/generators Spotless Apply
+ entry: ./gradlew :metadata-service:openapi-entity-servlet:generators:spotlessApply
+ language: system
+ files: ^metadata-service/openapi-entity-servlet/generators/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-openapi-servlet-spotless
+ name: metadata-service/openapi-servlet Spotless Apply
+ entry: ./gradlew :metadata-service:openapi-servlet:spotlessApply
+ language: system
+ files: ^metadata-service/openapi-servlet/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-openapi-servlet-models-spotless
+ name: metadata-service/openapi-servlet/models Spotless Apply
+ entry: ./gradlew :metadata-service:openapi-servlet:models:spotlessApply
+ language: system
+ files: ^metadata-service/openapi-servlet/models/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-plugin-spotless
+ name: metadata-service/plugin Spotless Apply
+ entry: ./gradlew :metadata-service:plugin:spotlessApply
+ language: system
+ files: ^metadata-service/plugin/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-plugin-src-test-sample-test-plugins-spotless
+ name: metadata-service/plugin/src/test/sample-test-plugins Spotless Apply
+ entry: ./gradlew :metadata-service:plugin:src:test:sample-test-plugins:spotlessApply
+ language: system
+ files: ^metadata-service/plugin/src/test/sample-test-plugins/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-restli-client-spotless
+ name: metadata-service/restli-client Spotless Apply
+ entry: ./gradlew :metadata-service:restli-client:spotlessApply
+ language: system
+ files: ^metadata-service/restli-client/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-restli-client-api-spotless
+ name: metadata-service/restli-client-api Spotless Apply
+ entry: ./gradlew :metadata-service:restli-client-api:spotlessApply
+ language: system
+ files: ^metadata-service/restli-client-api/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-restli-servlet-impl-spotless
+ name: metadata-service/restli-servlet-impl Spotless Apply
+ entry: ./gradlew :metadata-service:restli-servlet-impl:spotlessApply
+ language: system
+ files: ^metadata-service/restli-servlet-impl/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-schema-registry-api-spotless
+ name: metadata-service/schema-registry-api Spotless Apply
+ entry: ./gradlew :metadata-service:schema-registry-api:spotlessApply
+ language: system
+ files: ^metadata-service/schema-registry-api/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-schema-registry-servlet-spotless
+ name: metadata-service/schema-registry-servlet Spotless Apply
+ entry: ./gradlew :metadata-service:schema-registry-servlet:spotlessApply
+ language: system
+ files: ^metadata-service/schema-registry-servlet/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-services-spotless
+ name: metadata-service/services Spotless Apply
+ entry: ./gradlew :metadata-service:services:spotlessApply
+ language: system
+ files: ^metadata-service/services/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-service-servlet-spotless
+ name: metadata-service/servlet Spotless Apply
+ entry: ./gradlew :metadata-service:servlet:spotlessApply
+ language: system
+ files: ^metadata-service/servlet/.*\.java$
+ pass_filenames: false
+
+ - id: metadata-utils-spotless
+ name: metadata-utils Spotless Apply
+ entry: ./gradlew :metadata-utils:spotlessApply
+ language: system
+ files: ^metadata-utils/.*\.java$
+ pass_filenames: false
+
+ - id: mock-entity-registry-spotless
+ name: mock-entity-registry Spotless Apply
+ entry: ./gradlew :mock-entity-registry:spotlessApply
+ language: system
+ files: ^mock-entity-registry/.*\.java$
+ pass_filenames: false
+
+ - id: smoke-test-lint-fix
+ name: smoke-test Lint Fix
+ entry: ./gradlew :smoke-test:lintFix
+ language: system
+ files: ^smoke-test/.*\.py$
+ pass_filenames: false
+
+ - id: test-models-spotless
+ name: test-models Spotless Apply
+ entry: ./gradlew :test-models:spotlessApply
+ language: system
+ files: ^test-models/.*\.java$
+ pass_filenames: false
+
+ - id: smoke-test-cypress-lint-fix
+ name: smoke-test cypress Lint Fix
+ entry: ./gradlew :smoke-test:cypressLintFix
+ language: system
+ files: ^smoke-test/tests/cypress/.*$
+ pass_filenames: false
diff --git a/build.gradle b/build.gradle
index 8929b4e644972c..284092e2b14f49 100644
--- a/build.gradle
+++ b/build.gradle
@@ -211,7 +211,7 @@ project.ext.externalDependency = [
'mockitoInline': 'org.mockito:mockito-inline:4.11.0',
'mockServer': 'org.mock-server:mockserver-netty:5.11.2',
'mockServerClient': 'org.mock-server:mockserver-client-java:5.11.2',
- 'mysqlConnector': 'mysql:mysql-connector-java:8.0.28',
+ 'mysqlConnector': 'com.mysql:mysql-connector-j:8.4.0',
'neo4jHarness': 'org.neo4j.test:neo4j-harness:' + neo4jTestVersion,
'neo4jJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jVersion,
'neo4jTestJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jTestVersion,
@@ -235,7 +235,7 @@ project.ext.externalDependency = [
'playFilters': "com.typesafe.play:filters-helpers_$playScalaVersion:$playVersion",
'pac4j': 'org.pac4j:pac4j-oidc:6.0.6',
'playPac4j': "org.pac4j:play-pac4j_$playScalaVersion:12.0.0-PLAY2.8",
- 'postgresql': 'org.postgresql:postgresql:42.3.9',
+ 'postgresql': 'org.postgresql:postgresql:42.7.4',
'protobuf': 'com.google.protobuf:protobuf-java:3.25.5',
'grpcProtobuf': 'io.grpc:grpc-protobuf:1.53.0',
'rangerCommons': 'org.apache.ranger:ranger-plugins-common:2.3.0',
@@ -286,7 +286,8 @@ project.ext.externalDependency = [
'annotationApi': 'javax.annotation:javax.annotation-api:1.3.2',
'jakartaAnnotationApi': 'jakarta.annotation:jakarta.annotation-api:3.0.0',
'classGraph': 'io.github.classgraph:classgraph:4.8.172',
- 'mustache': 'com.github.spullara.mustache.java:compiler:0.9.14'
+ 'mustache': 'com.github.spullara.mustache.java:compiler:0.9.14',
+ 'javaxMail': 'com.sun.mail:jakarta.mail:1.6.7'
]
allprojects {
@@ -374,9 +375,11 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
exclude group: "org.slf4j", module: "slf4j-nop"
exclude group: "org.slf4j", module: "slf4j-ext"
exclude group: "org.codehaus.jackson", module: "jackson-mapper-asl"
+ exclude group: "javax.mail", module: "mail"
resolutionStrategy.force externalDependency.antlr4Runtime
resolutionStrategy.force externalDependency.antlr4
+ resolutionStrategy.force 'org.apache.mina:mina-core:2.2.4'
}
}
@@ -474,10 +477,6 @@ subprojects {
if (compileJavaTask != null) {
spotlessJavaTask.dependsOn compileJavaTask
}
- // TODO - Do not run this in CI. How?
- // tasks.withType(JavaCompile) {
- // finalizedBy(tasks.findByName('spotlessApply'))
- // }
}
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
index 94f0e8a055b701..59335ba605a741 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
@@ -2377,6 +2377,17 @@ private void configureDataJobResolvers(final RuntimeWiring.Builder builder) {
? dataJob.getDataPlatformInstance().getUrn()
: null;
}))
+ .dataFetcher(
+ "container",
+ new LoadableTypeResolver<>(
+ containerType,
+ (env) -> {
+ final DataJob dataJob = env.getSource();
+ return dataJob.getContainer() != null
+ ? dataJob.getContainer().getUrn()
+ : null;
+ }))
+ .dataFetcher("parentContainers", new ParentContainersResolver(entityClient))
.dataFetcher("runs", new DataJobRunsResolver(entityClient))
.dataFetcher("privileges", new EntityPrivilegesResolver(entityClient))
.dataFetcher("exists", new EntityExistsResolver(entityService))
@@ -2454,6 +2465,17 @@ private void configureDataFlowResolvers(final RuntimeWiring.Builder builder) {
? dataFlow.getDataPlatformInstance().getUrn()
: null;
}))
+ .dataFetcher(
+ "container",
+ new LoadableTypeResolver<>(
+ containerType,
+ (env) -> {
+ final DataFlow dataFlow = env.getSource();
+ return dataFlow.getContainer() != null
+ ? dataFlow.getContainer().getUrn()
+ : null;
+ }))
+ .dataFetcher("parentContainers", new ParentContainersResolver(entityClient))
.dataFetcher(
"health",
new EntityHealthResolver(
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java
index 3a697517bdecee..f2d38aadf49656 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/DataFlowType.java
@@ -74,6 +74,7 @@ public class DataFlowType
DOMAINS_ASPECT_NAME,
DEPRECATION_ASPECT_NAME,
DATA_PLATFORM_INSTANCE_ASPECT_NAME,
+ CONTAINER_ASPECT_NAME,
DATA_PRODUCTS_ASPECT_NAME,
BROWSE_PATHS_V2_ASPECT_NAME,
STRUCTURED_PROPERTIES_ASPECT_NAME,
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java
index 44bc6a99eae4bb..0902d6f2080b8f 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapper.java
@@ -16,6 +16,7 @@
import com.linkedin.data.DataMap;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
+import com.linkedin.datahub.graphql.generated.Container;
import com.linkedin.datahub.graphql.generated.DataFlow;
import com.linkedin.datahub.graphql.generated.DataFlowEditableProperties;
import com.linkedin.datahub.graphql.generated.DataFlowInfo;
@@ -106,6 +107,7 @@ public DataFlow apply(
(dataset, dataMap) ->
dataset.setDataPlatformInstance(
DataPlatformInstanceAspectMapper.map(context, new DataPlatformInstance(dataMap))));
+ mappingHelper.mapToResult(context, CONTAINER_ASPECT_NAME, DataFlowMapper::mapContainers);
mappingHelper.mapToResult(
BROWSE_PATHS_V2_ASPECT_NAME,
(dataFlow, dataMap) ->
@@ -206,6 +208,17 @@ private static void mapGlobalTags(
dataFlow.setTags(globalTags);
}
+ private static void mapContainers(
+ @Nullable final QueryContext context, @Nonnull DataFlow dataFlow, @Nonnull DataMap dataMap) {
+ final com.linkedin.container.Container gmsContainer =
+ new com.linkedin.container.Container(dataMap);
+ dataFlow.setContainer(
+ Container.builder()
+ .setType(EntityType.CONTAINER)
+ .setUrn(gmsContainer.getContainer().toString())
+ .build());
+ }
+
private static void mapDomains(
@Nullable final QueryContext context, @Nonnull DataFlow dataFlow, @Nonnull DataMap dataMap) {
final Domains domains = new Domains(dataMap);
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java
index 8d55ca6dbf7ac9..317ee39ea565e5 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java
@@ -75,6 +75,7 @@ public class DataJobType
DOMAINS_ASPECT_NAME,
DEPRECATION_ASPECT_NAME,
DATA_PLATFORM_INSTANCE_ASPECT_NAME,
+ CONTAINER_ASPECT_NAME,
DATA_PRODUCTS_ASPECT_NAME,
BROWSE_PATHS_V2_ASPECT_NAME,
SUB_TYPES_ASPECT_NAME,
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java
index ec57c95ce151e2..3403d1f8f7b7f2 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java
@@ -9,6 +9,7 @@
import com.linkedin.data.DataMap;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
+import com.linkedin.datahub.graphql.generated.Container;
import com.linkedin.datahub.graphql.generated.DataFlow;
import com.linkedin.datahub.graphql.generated.DataJob;
import com.linkedin.datahub.graphql.generated.DataJobEditableProperties;
@@ -112,6 +113,14 @@ public DataJob apply(
} else if (DATA_PLATFORM_INSTANCE_ASPECT_NAME.equals(name)) {
result.setDataPlatformInstance(
DataPlatformInstanceAspectMapper.map(context, new DataPlatformInstance(data)));
+ } else if (CONTAINER_ASPECT_NAME.equals(name)) {
+ final com.linkedin.container.Container gmsContainer =
+ new com.linkedin.container.Container(data);
+ result.setContainer(
+ Container.builder()
+ .setType(EntityType.CONTAINER)
+ .setUrn(gmsContainer.getContainer().toString())
+ .build());
} else if (BROWSE_PATHS_V2_ASPECT_NAME.equals(name)) {
result.setBrowsePathV2(BrowsePathsV2Mapper.map(context, new BrowsePathsV2(data)));
} else if (SUB_TYPES_ASPECT_NAME.equals(name)) {
diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql
index a5cb0893a64fae..adb24d92587b58 100644
--- a/datahub-graphql-core/src/main/resources/entity.graphql
+++ b/datahub-graphql-core/src/main/resources/entity.graphql
@@ -6275,6 +6275,16 @@ type DataFlow implements EntityWithRelationships & Entity & BrowsableEntity {
"""
dataPlatformInstance: DataPlatformInstance
+ """
+ The parent container in which the entity resides
+ """
+ container: Container
+
+ """
+ Recursively get the lineage of containers for this entity
+ """
+ parentContainers: ParentContainersResult
+
"""
Granular API for querying edges extending from this entity
"""
@@ -6457,6 +6467,16 @@ type DataJob implements EntityWithRelationships & Entity & BrowsableEntity {
"""
dataPlatformInstance: DataPlatformInstance
+ """
+ The parent container in which the entity resides
+ """
+ container: Container
+
+ """
+ Recursively get the lineage of containers for this entity
+ """
+ parentContainers: ParentContainersResult
+
"""
Additional read write properties associated with the Data Job
"""
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapperTest.java
new file mode 100644
index 00000000000000..a49f063f94d336
--- /dev/null
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataflow/mappers/DataFlowMapperTest.java
@@ -0,0 +1,42 @@
+package com.linkedin.datahub.graphql.types.dataflow.mappers;
+
+import com.linkedin.common.urn.Urn;
+import com.linkedin.datahub.graphql.generated.DataFlow;
+import com.linkedin.entity.Aspect;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.EnvelopedAspect;
+import com.linkedin.entity.EnvelopedAspectMap;
+import com.linkedin.metadata.Constants;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+import java.util.Map;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class DataFlowMapperTest {
+ private static final Urn TEST_DATA_FLOW_URN =
+ Urn.createFromTuple(Constants.DATA_FLOW_ENTITY_NAME, "dataflow1");
+ private static final Urn TEST_CONTAINER_URN =
+ Urn.createFromTuple(Constants.CONTAINER_ENTITY_NAME, "container1");
+
+ @Test
+ public void testMapDataFlowContainer() throws URISyntaxException {
+ com.linkedin.container.Container input = new com.linkedin.container.Container();
+ input.setContainer(TEST_CONTAINER_URN);
+
+ final Map<String, EnvelopedAspect> containerAspect = new HashMap<>();
+ containerAspect.put(
+ Constants.CONTAINER_ASPECT_NAME,
+ new EnvelopedAspect().setValue(new Aspect(input.data())));
+ final EntityResponse response =
+ new EntityResponse()
+ .setEntityName(Constants.DATA_FLOW_ENTITY_NAME)
+ .setUrn(TEST_DATA_FLOW_URN)
+ .setAspects(new EnvelopedAspectMap(containerAspect));
+
+ final DataFlow actual = DataFlowMapper.map(null, response);
+
+ Assert.assertEquals(actual.getUrn(), TEST_DATA_FLOW_URN.toString());
+ Assert.assertEquals(actual.getContainer().getUrn(), TEST_CONTAINER_URN.toString());
+ }
+}
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapperTest.java
new file mode 100644
index 00000000000000..d7fc0f198977eb
--- /dev/null
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapperTest.java
@@ -0,0 +1,42 @@
+package com.linkedin.datahub.graphql.types.datajob.mappers;
+
+import com.linkedin.common.urn.Urn;
+import com.linkedin.datahub.graphql.generated.DataJob;
+import com.linkedin.entity.Aspect;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.EnvelopedAspect;
+import com.linkedin.entity.EnvelopedAspectMap;
+import com.linkedin.metadata.Constants;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+import java.util.Map;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class DataJobMapperTest {
+ private static final Urn TEST_DATA_JOB_URN =
+ Urn.createFromTuple(Constants.DATA_JOB_ENTITY_NAME, "datajob1");
+ private static final Urn TEST_CONTAINER_URN =
+ Urn.createFromTuple(Constants.CONTAINER_ENTITY_NAME, "container1");
+
+ @Test
+ public void testMapDataJobContainer() throws URISyntaxException {
+ com.linkedin.container.Container input = new com.linkedin.container.Container();
+ input.setContainer(TEST_CONTAINER_URN);
+
+ final Map<String, EnvelopedAspect> containerAspect = new HashMap<>();
+ containerAspect.put(
+ Constants.CONTAINER_ASPECT_NAME,
+ new EnvelopedAspect().setValue(new Aspect(input.data())));
+ final EntityResponse response =
+ new EntityResponse()
+ .setEntityName(Constants.DATA_JOB_ENTITY_NAME)
+ .setUrn(TEST_DATA_JOB_URN)
+ .setAspects(new EnvelopedAspectMap(containerAspect));
+
+ final DataJob actual = DataJobMapper.map(null, response);
+
+ Assert.assertEquals(actual.getUrn(), TEST_DATA_JOB_URN.toString());
+ Assert.assertEquals(actual.getContainer().getUrn(), TEST_CONTAINER_URN.toString());
+ }
+}
diff --git a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx
index 3c03dfb65ccbcd..9e26bbadaca070 100644
--- a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx
+++ b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx
@@ -184,6 +184,7 @@ export class DataFlowEntity implements Entity {
degree={(result as any).degree}
paths={(result as any).paths}
health={data.health}
+ parentContainers={data.parentContainers}
/>
);
};
diff --git a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx
index f210f7c985ebf7..0c86e745eba29f 100644
--- a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx
+++ b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx
@@ -10,6 +10,7 @@ import {
GlobalTags,
Health,
Owner,
+ ParentContainersResult,
SearchInsight,
} from '../../../../types.generated';
import DefaultPreviewCard from '../../../preview/DefaultPreviewCard';
@@ -40,6 +41,7 @@ export const Preview = ({
degree,
paths,
health,
+ parentContainers,
}: {
urn: string;
name: string;
@@ -59,6 +61,7 @@ export const Preview = ({
degree?: number;
paths?: EntityPath[];
health?: Health[] | null;
+ parentContainers?: ParentContainersResult | null;
}): JSX.Element => {
const entityRegistry = useEntityRegistry();
return (
@@ -91,6 +94,7 @@ export const Preview = ({
degree={degree}
paths={paths}
health={health || undefined}
+ parentContainers={parentContainers}
/>
);
};
diff --git a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx
index 5b1aaeaef76d5b..ff6490ebc91b0c 100644
--- a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx
+++ b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx
@@ -205,6 +205,7 @@ export class DataJobEntity implements Entity {
degree={(result as any).degree}
paths={(result as any).paths}
health={data.health}
+ parentContainers={data.parentContainers}
/>
);
};
diff --git a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx
index b163722b5151c7..07ff81effbbc65 100644
--- a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx
+++ b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx
@@ -12,6 +12,7 @@ import {
GlobalTags,
Health,
Owner,
+ ParentContainersResult,
SearchInsight,
} from '../../../../types.generated';
import DefaultPreviewCard from '../../../preview/DefaultPreviewCard';
@@ -44,6 +45,7 @@ export const Preview = ({
degree,
paths,
health,
+ parentContainers,
}: {
urn: string;
name: string;
@@ -64,6 +66,7 @@ export const Preview = ({
degree?: number;
paths?: EntityPath[];
health?: Health[] | null;
+ parentContainers?: ParentContainersResult | null;
}): JSX.Element => {
const entityRegistry = useEntityRegistry();
return (
@@ -98,6 +101,7 @@ export const Preview = ({
degree={degree}
paths={paths}
health={health || undefined}
+ parentContainers={parentContainers}
/>
);
};
diff --git a/datahub-web-react/src/graphql/dataFlow.graphql b/datahub-web-react/src/graphql/dataFlow.graphql
index 2441ce600c3c55..199c47811ce08e 100644
--- a/datahub-web-react/src/graphql/dataFlow.graphql
+++ b/datahub-web-react/src/graphql/dataFlow.graphql
@@ -50,6 +50,9 @@ fragment dataFlowFields on DataFlow {
dataPlatformInstance {
...dataPlatformInstanceFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
browsePathV2 {
...browsePathV2Fields
}
diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql
index 788c68349b4268..68c57c5cb5db55 100644
--- a/datahub-web-react/src/graphql/fragments.graphql
+++ b/datahub-web-react/src/graphql/fragments.graphql
@@ -403,6 +403,9 @@ fragment dataJobFields on DataJob {
dataPlatformInstance {
...dataPlatformInstanceFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
privileges {
canEditLineage
}
diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql
index 58c9a51f3d7e90..72e7d347187828 100644
--- a/datahub-web-react/src/graphql/search.graphql
+++ b/datahub-web-react/src/graphql/search.graphql
@@ -128,6 +128,9 @@ fragment autoCompleteFields on Entity {
dataPlatformInstance {
...dataPlatformInstanceFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
}
... on DataJob {
dataFlow {
@@ -146,6 +149,9 @@ fragment autoCompleteFields on Entity {
dataPlatformInstance {
...dataPlatformInstanceFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
}
... on GlossaryTerm {
name
@@ -626,6 +632,9 @@ fragment searchResultsWithoutSchemaField on Entity {
dataPlatformInstance {
...dataPlatformInstanceFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
domain {
...entityDomain
}
@@ -677,6 +686,9 @@ fragment searchResultsWithoutSchemaField on Entity {
dataPlatformInstance {
...dataPlatformInstanceFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
subTypes {
typeNames
}
diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile
index 89974e56575b07..16e6477c37ce69 100644
--- a/docker/datahub-frontend/Dockerfile
+++ b/docker/datahub-frontend/Dockerfile
@@ -1,7 +1,7 @@
# Defining environment
ARG APP_ENV=prod
-FROM alpine:3.20 AS base
+FROM alpine:3.21 AS base
# Configurable repositories
ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile
index 47b10535f8deea..52cc507f9268d1 100644
--- a/docker/datahub-gms/Dockerfile
+++ b/docker/datahub-gms/Dockerfile
@@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20 AS base
+FROM alpine:3.21 AS base
ENV JMX_VERSION=0.18.0
ENV JETTY_VERSION=11.0.21
diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile
index 74375072761d89..4ddec56311fb96 100644
--- a/docker/datahub-mae-consumer/Dockerfile
+++ b/docker/datahub-mae-consumer/Dockerfile
@@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20 AS base
+FROM alpine:3.21 AS base
# Re-declaring args from above to make them available in this stage (will inherit default values)
ARG ALPINE_REPO_URL
diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile
index 3adef53cd06068..42e40cd5942144 100644
--- a/docker/datahub-mce-consumer/Dockerfile
+++ b/docker/datahub-mce-consumer/Dockerfile
@@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20 AS base
+FROM alpine:3.21 AS base
# Re-declaring args from above to make them available in this stage (will inherit default values)
ARG ALPINE_REPO_URL
diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile
index a8ef4e8034fdd5..d63ceb83dc5295 100644
--- a/docker/datahub-upgrade/Dockerfile
+++ b/docker/datahub-upgrade/Dockerfile
@@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20 AS base
+FROM alpine:3.21 AS base
# Re-declaring args from above to make them available in this stage (will inherit default values)
ARG ALPINE_REPO_URL
@@ -34,16 +34,12 @@ ARG MAVEN_CENTRAL_REPO_URL
RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then sed -i "s#http.*://dl-cdn.alpinelinux.org/alpine#${ALPINE_REPO_URL}#g" /etc/apk/repositories ; fi
ENV JMX_VERSION=0.18.0
-ENV JETTY_VERSION=11.0.21
# Upgrade Alpine and base packages
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat snappy \
&& apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
- && curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-runner/${JETTY_VERSION}/jetty-runner-${JETTY_VERSION}.jar --output jetty-runner.jar \
- && curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-jmx/${JETTY_VERSION}/jetty-jmx-${JETTY_VERSION}.jar --output jetty-jmx.jar \
- && curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-util/${JETTY_VERSION}/jetty-util-${JETTY_VERSION}.jar --output jetty-util.jar \
&& wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \
&& wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \
&& cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile
index 1a6fe5bee6c840..584007a5fb0a9c 100644
--- a/docker/elasticsearch-setup/Dockerfile
+++ b/docker/elasticsearch-setup/Dockerfile
@@ -23,7 +23,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20 AS base
+FROM alpine:3.21 AS base
ARG ALPINE_REPO_URL
diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile
index 8a2d42bc233180..21b696a1b906fe 100644
--- a/docker/mysql-setup/Dockerfile
+++ b/docker/mysql-setup/Dockerfile
@@ -17,7 +17,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20
+FROM alpine:3.21
COPY --from=binary /go/bin/dockerize /usr/local/bin
ARG ALPINE_REPO_URL
diff --git a/docker/mysql-setup/init.sh b/docker/mysql-setup/init.sh
index b5ee294ddd6559..2760da86a9a33f 100755
--- a/docker/mysql-setup/init.sh
+++ b/docker/mysql-setup/init.sh
@@ -1,6 +1,7 @@
#!/bin/bash
: ${MYSQL_PORT:=3306}
+: ${MYSQL_ARGS:=--ssl=0}
sed -e "s/DATAHUB_DB_NAME/${DATAHUB_DB_NAME}/g" /init.sql | tee -a /tmp/init-final.sql
-mysql -u $MYSQL_USERNAME -p"$MYSQL_PASSWORD" -h $MYSQL_HOST -P $MYSQL_PORT < /tmp/init-final.sql
\ No newline at end of file
+mariadb -u $MYSQL_USERNAME -p"$MYSQL_PASSWORD" -h $MYSQL_HOST -P $MYSQL_PORT $MYSQL_ARGS < /tmp/init-final.sql
\ No newline at end of file
diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile
index 31e9687cea15e8..5362e0d787c15d 100644
--- a/docker/postgres-setup/Dockerfile
+++ b/docker/postgres-setup/Dockerfile
@@ -17,7 +17,7 @@ WORKDIR /go/src/github.com/jwilder/dockerize
RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION
-FROM alpine:3.20
+FROM alpine:3.21
COPY --from=binary /go/bin/dockerize /usr/local/bin
ARG ALPINE_REPO_URL
diff --git a/docs-website/README.md b/docs-website/README.md
index 3b24cb869a444d..b40e4636422781 100644
--- a/docs-website/README.md
+++ b/docs-website/README.md
@@ -130,7 +130,6 @@ The purpose of this section is to provide developers & technical users with conc
This section aims to provide plain-language feature overviews for both technical and non-technical readers alike.
-
## Docs Generation Features
**Includes all markdown files**
@@ -145,16 +144,33 @@ You can suppress this check by adding the path to the file in a comment in `side
Use an "inline" directive to include code snippets from other files. The `show_path_as_comment` option will include the path to the file as a comment at the top of the snippet.
- ```python
- {{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }}
- ```
+ ```python
+ {{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }}
+ ```
+
+**Command Output**
+
+Use the `{{ command-output cmd }}` directive to run subprocesses and inject the outputs into the final markdown.
+
+ {{ command-output python -c 'print("Hello world")' }}
+This also works for multi-line scripts.
+
+ {{ command-output
+ source metadata-ingestion/venv/bin/activate
+ python -m
+ }}
+
+Regardless of the location of the markdown file, the subcommands will be executed with the working directory set to the repo root.
+
+Only the stdout of the subprocess will be injected into the markdown. The stderr, if any, will be included as a comment.
## Docs site generation process
This process is orchestrated by a combination of Gradle and Yarn tasks. The main entrypoint is via the `docs-website:yarnGenerate` task, which in turn eventually runs `yarn run generate`.
Steps:
+
1. Generate the GraphQL combined schema using the gradle's `docs-website:generateGraphQLSchema` task. This generates `./graphql/combined.graphql`.
2. Generate docs for ingestion sources using the `:metadata-ingestion:docGen` gradle task.
3. Generate docs for our metadata model using the `:metadata-ingestion:modelDocGen` gradle task.
diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js
index 31644f459ed731..350521ea8ee643 100644
--- a/docs-website/docusaurus.config.js
+++ b/docs-website/docusaurus.config.js
@@ -13,6 +13,15 @@ module.exports = {
projectName: "datahub", // Usually your repo name.
staticDirectories: ["static", "genStatic"],
stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap"],
+ headTags: [
+ {
+ tagName: 'meta',
+ attributes: {
+ httpEquiv: 'Content-Security-Policy',
+ content: "frame-ancestors 'self' https://*.acryl.io https://acryldata.io http://localhost:*"
+ }
+ },
+ ],
scripts: [
{
src: "https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38",
diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts
index ad82a85f9e5672..3a14baee073c2a 100644
--- a/docs-website/generateDocsDir.ts
+++ b/docs-website/generateDocsDir.ts
@@ -439,6 +439,42 @@ function markdown_process_inline_directives(
contents.content = new_content;
}
+function markdown_process_command_output(
+ contents: matter.GrayMatterFile,
+ filepath: string
+): void {
+ const new_content = contents.content.replace(
+ /^{{\s*command-output\s*([\s\S]*?)\s*}}$/gm,
+ (_, command: string) => {
+ try {
+ // Change to repo root directory before executing command
+ const repoRoot = path.resolve(__dirname, "..");
+
+ console.log(`Executing command: ${command}`);
+
+ // Execute the command and capture output
+ const output = execSync(command, {
+ cwd: repoRoot,
+ encoding: "utf8",
+ stdio: ["pipe", "pipe", "pipe"],
+ });
+
+ // Return the command output
+ return output.trim();
+ } catch (error: any) {
+ // If there's an error, include it as a comment
+ const errorMessage = error.stderr
+ ? error.stderr.toString()
+ : error.message;
+ return `${
+ error.stdout ? error.stdout.toString().trim() : ""
+ }\n<!-- ${errorMessage} -->`;
+ }
+ }
+ );
+ contents.content = new_content;
+}
+
function markdown_sanitize_and_linkify(content: string): string {
// MDX escaping
content = content.replace(/
-Install the relevant CLI version. Forms are available as of CLI version `0.13.1`. The corresponding DataHub Cloud release version is `v0.2.16.5`
+Install the relevant CLI version.
+Structured Properties were introduced in version `0.13.1`, but we continuously improve and add new functionality, so you should always [upgrade](https://datahubproject.io/docs/cli/#installation) to the latest CLI for best results.
Connect to your instance via [init](https://datahubproject.io/docs/cli/#init):
- Run `datahub init` to update the instance you want to load into.
@@ -56,33 +58,8 @@ Requirements for OpenAPI are:
The following code will create a structured property `io.acryl.privacy.retentionTime`.
-
-```graphql
-mutation createStructuredProperty {
- createStructuredProperty(
- input: {
- id: "retentionTime",
- qualifiedName:"retentionTime",
- displayName: "Retention Time",
- description: "Retention Time is used to figure out how long to retain records in a dataset",
- valueType: "urn:li:dataType:datahub.number",
- allowedValues: [
- {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"},
- {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"},
- {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"}
- ],
- cardinality: SINGLE,
- entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"],
- }
- ) {
- urn
- }
-}
-```
-
-
-
+
Create a yaml file representing the properties you’d like to load.
For example, below file represents a property `io.acryl.privacy.retentionTime`. You can see the full example [here](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/structured_properties/struct_props.yaml).
@@ -108,13 +85,41 @@ For example, below file represents a property `io.acryl.privacy.retentionTime`.
```
Use the CLI to create your properties:
-```commandline
+```shell
datahub properties upsert -f {properties_yaml}
```
If successful, you should see `Created structured property urn:li:structuredProperty:...`
+
+
+
+```graphql
+mutation createStructuredProperty {
+ createStructuredProperty(
+ input: {
+ id: "retentionTime",
+ qualifiedName:"retentionTime",
+ displayName: "Retention Time",
+ description: "Retention Time is used to figure out how long to retain records in a dataset",
+ valueType: "urn:li:dataType:datahub.number",
+ allowedValues: [
+ {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"},
+ {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"},
+ {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"}
+ ],
+ cardinality: SINGLE,
+ entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"],
+ }
+ ) {
+ urn
+ }
+}
+```
+
+
+
```shell
@@ -236,9 +241,182 @@ Example Response:
-## Read Structured Properties
+## List Structured Properties
+
+You can list all structured properties in your DataHub instance using the following methods:
+
+
+
+
+```shell
+datahub properties list
+```
+
+This will show all properties with their full details.
+
+Example Response:
+```json
+{
+ "urn": "urn:li:structuredProperty:clusterName",
+ "qualified_name": "clusterName",
+ "type": "urn:li:dataType:datahub.string",
+ "description": "Test Cluster Name Property",
+ "display_name": "Cluster's name",
+ "entity_types": [
+ "urn:li:entityType:datahub.dataset"
+ ],
+ "cardinality": "SINGLE"
+}
+{
+ "urn": "urn:li:structuredProperty:projectNames",
+ "qualified_name": "projectNames",
+ "type": "urn:li:dataType:datahub.string",
+ "description": "Test property for project name",
+ "display_name": "Project Name",
+ "entity_types": [
+ "urn:li:entityType:datahub.dataset",
+ "urn:li:entityType:datahub.dataFlow"
+ ],
+ "cardinality": "MULTIPLE",
+ "allowed_values": [
+ {
+ "value": "Tracking",
+ "description": "test value 1 for project"
+ },
+ {
+ "value": "DataHub",
+ "description": "test value 2 for project"
+ }
+ ]
+}
+```
+
+
+If you only want to see the URNs, you can use:
+
+```shell
+datahub properties list --no-details
+```
+
+Example Response:
+```
+[2025-01-08 22:23:00,625] INFO {datahub.cli.specific.structuredproperties_cli:134} - Listing structured property urns only, use --details for more information
+urn:li:structuredProperty:clusterName
+urn:li:structuredProperty:clusterType
+urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate
+urn:li:structuredProperty:projectNames
+```
+
+To download all the structured property definitions into a single file that you can use with the `upsert` command as described in the [create section](#create-structured-properties), you can run the list command with the `--to-file` option.
+
+```shell
+datahub properties list --to-file structured_properties.yaml
+```
+
+Example Response:
+```yaml
+ - urn: urn:li:structuredProperty:clusterName
+ qualified_name: clusterName
+ type: urn:li:dataType:datahub.string
+ description: Test Cluster Name Property
+ display_name: Cluster's name
+ entity_types:
+ - urn:li:entityType:datahub.dataset
+ cardinality: SINGLE
+ - urn: urn:li:structuredProperty:clusterType
+ qualified_name: clusterType
+ type: urn:li:dataType:datahub.string
+ description: Test Cluster Type Property
+ display_name: Cluster's type
+ entity_types:
+ - urn:li:entityType:datahub.dataset
+ cardinality: SINGLE
+ - urn: urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate
+ qualified_name: io.acryl.dataManagement.deprecationDate
+ type: urn:li:dataType:datahub.date
+ display_name: Deprecation Date
+ entity_types:
+ - urn:li:entityType:datahub.dataset
+ - urn:li:entityType:datahub.dataFlow
+ - urn:li:entityType:datahub.dataJob
+ - urn:li:entityType:datahub.schemaField
+ cardinality: SINGLE
+ - urn: urn:li:structuredProperty:io.acryl.privacy.enumProperty5712
+ qualified_name: io.acryl.privacy.enumProperty5712
+ type: urn:li:dataType:datahub.string
+ description: The retention policy for the dataset
+ entity_types:
+ - urn:li:entityType:datahub.dataset
+ cardinality: MULTIPLE
+ allowed_values:
+ - value: foo
+ - value: bar
+... etc.
+```
+
+
+
+
+
+Example Request:
+```bash
+curl -X 'GET' \
+ 'http://localhost:9002/openapi/v3/entity/structuredproperty?systemMetadata=false&includeSoftDelete=false&skipCache=false&aspects=structuredPropertySettings&aspects=propertyDefinition&aspects=institutionalMemory&aspects=structuredPropertyKey&aspects=status&count=10&sortCriteria=urn&sortOrder=ASCENDING&query=*' \
+ -H 'accept: application/json'
+```
+
+Example Response:
+```json
+{
+ "scrollId": "...",
+ "entities": [
+ {
+ "urn": "urn:li:structuredProperty:clusterName",
+ "propertyDefinition": {
+ "value": {
+ "immutable": false,
+ "qualifiedName": "clusterName",
+ "displayName": "Cluster's name",
+ "valueType": "urn:li:dataType:datahub.string",
+ "description": "Test Cluster Name Property",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset"
+ ],
+ "cardinality": "SINGLE"
+ }
+ },
+ "structuredPropertyKey": {
+ "value": {
+ "id": "clusterName"
+ }
+ }
+ }
+ ]
+}
+```
+
+Key Query Parameters:
+- `count`: Number of results to return per page (default: 10)
+- `sortCriteria`: Field to sort by (default: urn)
+- `sortOrder`: Sort order (ASCENDING or DESCENDING)
+- `query`: Search query to filter properties (* for all)
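+
+For illustration, here is a minimal Python sketch of the same request using the `requests` library (assuming a locally running instance at `localhost:9002`, as in the curl example above; add an authentication header if your deployment requires one):
+
+```python
+import requests
+
+# Query the OpenAPI v3 endpoint for structured property entities.
+response = requests.get(
+    "http://localhost:9002/openapi/v3/entity/structuredproperty",
+    params={
+        "query": "*",
+        "count": 10,
+        "sortCriteria": "urn",
+        "sortOrder": "ASCENDING",
+    },
+    headers={"accept": "application/json"},
+)
+response.raise_for_status()
+
+# Print the URN of each returned structured property.
+for entity in response.json().get("entities", []):
+    print(entity["urn"])
+```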
+
+
+
+
+The list endpoint returns all structured properties in your DataHub instance. Each property includes:
+- URN: Unique identifier for the property
+- Qualified Name: The property's qualified name
+- Type: The data type of the property (string, number, date, etc.)
+- Description: A description of the property's purpose
+- Display Name: Human-readable name for the property
+- Entity Types: The types of entities this property can be applied to
+- Cardinality: Whether the property accepts single (SINGLE) or multiple (MULTIPLE) values
+- Allowed Values: If specified, the list of allowed values for this property
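+
+For example, a minimal Python sketch using the DataHub SDK, mirroring `metadata-ingestion/examples/structured_properties/list_structured_properties.py` (this assumes `datahub init` has been run so a default graph client can be created):
+
+```python
+from datahub.api.entities.structuredproperties.structuredproperties import (
+    StructuredProperties,
+)
+from datahub.ingestion.graph.client import get_default_graph
+
+# Connect to the DataHub instance configured via `datahub init`.
+with get_default_graph() as graph:
+    # List all structured property definitions and print them.
+    for structured_property in StructuredProperties.list(graph):
+        print(structured_property.dict())
+```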
-You can see the properties you created by running the following command:
+## Read a single Structured Property
+
+You can read an individual property you created by running the following command:
@@ -279,6 +457,91 @@ If successful, you should see metadata about your properties returned.
}
```
+
+
+
+Example Request:
+```graphql
+query {
+ structuredProperty(urn: "urn:li:structuredProperty:projectNames") {
+ urn
+ type
+ definition {
+ qualifiedName
+ displayName
+ description
+ cardinality
+ allowedValues {
+ value {
+ ... on StringValue {
+ stringValue
+ }
+ ... on NumberValue {
+ numberValue
+ }
+ }
+ description
+ }
+ entityTypes {
+ urn
+ info {
+ type
+ qualifiedName
+ }
+ }
+ }
+ }
+}
+```
+
+Example Response:
+```json
+{
+ "data": {
+ "structuredProperty": {
+ "urn": "urn:li:structuredProperty:projectNames",
+ "type": "STRUCTURED_PROPERTY",
+ "definition": {
+ "qualifiedName": "projectNames",
+ "displayName": "Project Name",
+ "description": "Test property for project name",
+ "cardinality": "MULTIPLE",
+ "allowedValues": [
+ {
+ "value": {
+ "stringValue": "Tracking"
+ },
+ "description": "test value 1 for project"
+ },
+ {
+ "value": {
+ "stringValue": "DataHub"
+ },
+ "description": "test value 2 for project"
+ }
+ ],
+ "entityTypes": [
+ {
+ "urn": "urn:li:entityType:datahub.dataset",
+ "info": {
+ "type": "DATASET",
+ "qualifiedName": "datahub.dataset"
+ }
+ },
+ {
+ "urn": "urn:li:entityType:datahub.dataFlow",
+ "info": {
+ "type": "DATA_FLOW",
+ "qualifiedName": "datahub.dataFlow"
+ }
+ }
+ ]
+ }
+ }
+ },
+ "extensions": {}
+}
+```
@@ -389,7 +652,7 @@ Example Response:
This action will set/replace all structured properties on the entity. See PATCH operations to add/remove a single property.
-
+
```graphql
mutation upsertStructuredProperties {
@@ -537,7 +800,7 @@ datahub dataset get --urn {urn}
For reading all structured properties from a dataset:
-
+
```graphql
query getDataset {
diff --git a/docs/businessattributes.md b/docs/businessattributes.md
index 3e912e7e609805..2359c2ac85b585 100644
--- a/docs/businessattributes.md
+++ b/docs/businessattributes.md
@@ -1,5 +1,10 @@
+import FeatureAvailability from '@site/src/components/FeatureAvailability';
+
# Business Attributes
+
+
+>**Note:** This is a BETA feature.
## What are Business Attributes
A Business Attribute, as its name implies, is an attribute with a business focus. It embodies the traits or properties of an entity within a business framework. This attribute is a crucial piece of data for a business, utilised to define or control the entity throughout the organisation. If a business process or concept is depicted as a comprehensive logical model, then each Business Attribute can be considered as an individual component within that model. While business names and descriptions are generally managed through glossary terms, Business Attributes encompass additional characteristics such as data quality rules/assertions, data privacy markers, data usage protocols, standard tags, and supplementary documentation, alongside Names and Descriptions.
@@ -70,9 +75,11 @@ Description inherited from business attribute is greyed out to differentiate bet
### Enable Business Attributes Feature
-By default, business attribute is disabled. To enable Business Attributes feature, set the following configuration in [application.yaml](../metadata-service/configuration/src/main/resources/application.yaml)
-
-businessAttributeEntityEnabled : true
+By default, the Business Attributes feature is disabled. To enable it, export the following environment variable
+(this may be done via `extraEnvs` for the GMS deployment):
+```shell
+BUSINESS_ATTRIBUTE_ENTITY_ENABLED=true
+```
### What updates are planned for the Business Attributes feature?
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 07577079d66d12..68b41c907c6ad6 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -44,6 +44,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- OpenAPI Update: PIT Keep Alive parameter added to scroll. NOTE: This parameter requires the `pointInTimeCreationEnabled` feature flag to be enabled and the `elasticSearch.implementation` configuration to be `elasticsearch`. This feature is not supported for OpenSearch at this time and the parameter will not be respected without both of these set.
- OpenAPI Update 2: Previously there was an incorrectly marked parameter named `sort` on the generic list entities endpoint for v3. This parameter is deprecated and only supports a single string value while the documentation indicates it supports a list of strings. This documentation error has been fixed and the correct field, `sortCriteria`, is now documented which supports a list of strings.
- #12223: For dbt Cloud ingestion, the "View in dbt" link will point at the "Explore" page in the dbt Cloud UI. You can revert to the old behavior of linking to the dbt Cloud IDE by setting `external_url_mode: ide".
+- #12236: Data flow and data job entities may additionally produce a container aspect, which requires a corresponding server upgrade. Otherwise, the server may reject the aspect.
### Breaking Changes
diff --git a/docs/managed-datahub/subscription-and-notification.md b/docs/managed-datahub/subscription-and-notification.md
index c3c31d5fed7e61..c27754a6371265 100644
--- a/docs/managed-datahub/subscription-and-notification.md
+++ b/docs/managed-datahub/subscription-and-notification.md
@@ -17,9 +17,30 @@ Email will work out of box. For installing the DataHub Slack App, see:
This feature is especially useful in helping you stay on top of any upstream changes that could impact the assets you or your stakeholders rely on. It eliminates the need for you and your team to manually check for upstream changes, or for upstream stakeholders to identify and notify impacted users.
As a user, you can subscribe to and receive notifications about changes such as deprecations, schema changes, changes in ownership, assertions, or incidents. You’ll always been in the know about potential data quality issues so you can proactively manage your data resources.
+
+## Platform Admin Notifications
+
+DataHub provides three levels of notifications:
+
+- **Platform-level**
+- **Group-level** (described in other sections)
+- **User-level** (described in other sections)
+
+**Setting Platform-Level Notifications:**
+This requires appropriate permissions. Go to `Settings` > `Notifications` (under the `Platform` section, not `My Notifications`).
+
+**Platform-level Notifications:**
+Platform-level notifications are applied to all assets within DataHub.
+Example: If "An owner is added or removed from a data asset" is ticked, the designated Slack channel or email will receive notifications for any such changes across all assets.
+
+**Our Recommendations:**
+
+Notifying on tag changes for every asset in the platform would be noisy, so we recommend using platform-level notifications only where appropriate. For example, we recommend routing notifications for ingestion failures to a central Slack channel or email. This will help you proactively ensure your DataHub metadata stays fresh.
+
## Prerequisites
Once you have [configured Slack within your DataHub instance](slack/saas-slack-setup.md), you will be able to subscribe to any Entity in DataHub and begin recieving notifications via DM.
+
To begin receiving personal notifications, go to Settings > "My Notifications". From here, toggle on Slack Notifications and input your Slack Member ID.
If you want to create and manage group-level Subscriptions for your team, you will need [the following privileges](../../docs/authorization/roles.md#role-privileges):
@@ -162,6 +183,21 @@ You can unsubscribe from any asset to stop receiving notifications about it. On
What if I want to be notified about different changes?
To modify your subscription, use the dropdown menu next to the Subscribe button to modify the changes you want to be notified about.
+
+
+
+I want to configure multiple channels. How many Slack channels or emails can I configure to get notified?
+
+At the platform level, you can configure one email and one Slack channel.
+
+At the user and group levels, you can configure one default email and Slack channel, and you can override that email/channel when you
+subscribe to a specific asset.
+
+To configure multiple channels, first ensure you have the appropriate privileges, then:
+1. Create a DataHub group for each channel you want notifications for.
+2. Add yourself as a member to each of the groups.
+3. Now, when you visit an asset and go to subscribe, you'll see the option "Manage Group Subscriptions".
+
## Reference
diff --git a/gradle/coverage/java-coverage.gradle b/gradle/coverage/java-coverage.gradle
index 17260c1a309788..fe8bc65336a983 100644
--- a/gradle/coverage/java-coverage.gradle
+++ b/gradle/coverage/java-coverage.gradle
@@ -22,7 +22,7 @@ afterEvaluate {
Tools that aggregate and analyse coverage tools search for the coverage result files. Keeping them under one
folder will minimize the time spent searching through the full source tree.
*/
- outputLocation = rootProject.layout.buildDirectory.file("coverage-reports/jacoco-${project.name}.xml")
+ outputLocation = rootProject.layout.buildDirectory.file("coverage-reports/${rootProject.relativePath(project.projectDir)}/jacoco-${project.name}.xml")
}
csv.required = false
html.required = false
diff --git a/gradle/coverage/python-coverage.gradle b/gradle/coverage/python-coverage.gradle
index 23d6e37387ed83..05eb79cf5659e2 100644
--- a/gradle/coverage/python-coverage.gradle
+++ b/gradle/coverage/python-coverage.gradle
@@ -7,7 +7,7 @@ ext.get_coverage_args = { test_name = "" ->
Tools that aggregate and analyse coverage tools search for the coverage result files. Keeping them under one folder
will minimize the time spent searching through the full source tree.
*/
- def base_path = "${rootProject.buildDir}/coverage-reports"
+ def base_path = "${rootProject.buildDir}/coverage-reports/${rootProject.relativePath(project.projectDir)}/"
/*
--cov=src was added via setup.cfg in many of the python projects but for some reason, was not getting picked up
diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py
index 2693aab0700da3..d07063dbffc5c4 100644
--- a/metadata-ingestion-modules/airflow-plugin/setup.py
+++ b/metadata-ingestion-modules/airflow-plugin/setup.py
@@ -119,6 +119,7 @@ def get_long_description():
"pendulum<3.0",
"Flask-Session<0.6.0",
"connexion<3.0",
+ "marshmallow<3.24.0",
},
}
diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md
index 39eac229a66013..457725b6783e52 100644
--- a/metadata-ingestion/docs/dev_guides/classification.md
+++ b/metadata-ingestion/docs/dev_guides/classification.md
@@ -7,10 +7,10 @@ The classification feature enables sources to be configured to automatically pre
Note that a `.` is used to denote nested fields in the YAML recipe.
| Field | Required | Type | Description | Default |
-| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
+| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |------------------------------------------------------------|
| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
| sample_size | | int | Number of sample values used for classification. | 100 |
-| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. | Number of cpu cores or 4 |
+| max_workers | | int | Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable. | 1 |
| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedance. | [{'type': 'datahub', 'config': None}] |
| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
diff --git a/metadata-ingestion/docs/sources/metadata-file/metadata-file_recipe.yml b/metadata-ingestion/docs/sources/metadata-file/file_recipe.yml
similarity index 100%
rename from metadata-ingestion/docs/sources/metadata-file/metadata-file_recipe.yml
rename to metadata-ingestion/docs/sources/metadata-file/file_recipe.yml
diff --git a/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_pre.md b/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_pre.md
new file mode 100644
index 00000000000000..ae9812b2a48ad3
--- /dev/null
+++ b/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_pre.md
@@ -0,0 +1,16 @@
+### Configuration Notes
+
+To set up access:
+
+1. See the [Microsoft Grant user access to a Report Server doc](https://docs.microsoft.com/en-us/sql/reporting-services/security/grant-user-access-to-a-report-server?view=sql-server-ver16).
+2. Use your user credentials from the previous step in the yaml file.
+
+### Concept mapping
+
+| Power BI Report Server | DataHub     |
+| ---------------------- | ----------- |
+| `Paginated Report` | `Dashboard` |
+| `Power BI Report` | `Dashboard` |
+| `Mobile Report` | `Dashboard` |
+| `Linked Report` | `Dashboard` |
+| `Dataset, Datasource` | `N/A` |
diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_recipe.yml b/metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_recipe.yml
similarity index 100%
rename from metadata-ingestion/docs/sources/powerbi/powerbi-report-server_recipe.yml
rename to metadata-ingestion/docs/sources/powerbi-report-server/powerbi-report-server_recipe.yml
diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_pre.md
deleted file mode 100644
index ca600f10786758..00000000000000
--- a/metadata-ingestion/docs/sources/powerbi/powerbi-report-server_pre.md
+++ /dev/null
@@ -1,13 +0,0 @@
-## Configuration Notes
-See the
-1. [Microsoft Grant user access to a Report Server doc](https://docs.microsoft.com/en-us/sql/reporting-services/security/grant-user-access-to-a-report-server?view=sql-server-ver16)
-2. Use your user credentials from previous step in yaml file
-## Concept mapping
-
-| Power BI Report Server | Datahub |
-| ------------------------- | ------------------- |
-| `Paginated Report` | `Dashboard` |
-| `Power BI Report` | `Dashboard` |
-| `Mobile Report` | `Dashboard` |
-| `Linked Report` | `Dashboard` |
-| `Dataset, Datasource` | `N/A` |
diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml
index 7e8dbcff88e1c0..3226f23c963ddf 100644
--- a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml
+++ b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml
@@ -4,6 +4,9 @@ source:
# This option is recommended to be used to ingest all lineage
ignore_start_time_lineage: true
+ # This flag tells the snowflake ingestion to use the more advanced query parsing. This will become the default eventually.
+ use_queries_v2: true
+
# Coordinates
account_id: "abc48144"
warehouse: "COMPUTE_WH"
diff --git a/metadata-ingestion/docs/sources/tableau/tableau_pre.md b/metadata-ingestion/docs/sources/tableau/tableau_pre.md
index aeb67f85b241b9..65ff08367fdc8f 100644
--- a/metadata-ingestion/docs/sources/tableau/tableau_pre.md
+++ b/metadata-ingestion/docs/sources/tableau/tableau_pre.md
@@ -3,9 +3,24 @@
In order to ingest metadata from Tableau, you will need:
- Tableau Server Version 2021.1.10 and above. It may also work for older versions.
-- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if its not already enabled.
-- Tableau Credentials (Username/Password or [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens))
-- The user or token must have **Site Administrator Explorer** permissions.
+- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if it's not already enabled. This is always enabled for Tableau Cloud.
+
+### Authentication
+
+DataHub supports two authentication methods:
+
+1. Username/Password
+2. [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens)
+
+Either way, the user/token must have the **Site Administrator Explorer** site role.
+
+:::info
+
+We need the `Site Administrator Explorer` site role in order to get complete metadata from Tableau.
+
+With any lower role, the Tableau Metadata API returns missing/partial metadata. This particularly affects data source fields and definitions, which impacts our ability to extract columns and generate column lineage. As such, other site roles like `Viewer` are insufficient with the current Tableau Metadata API.
+
+:::
### Ingestion through UI
@@ -46,8 +61,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce
| Source Concept | DataHub Concept | Notes |
| --------------------------- | ------------------------------------------------------------- | --------------------------------- |
-| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
-| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
+| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) |
+| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` |
| Embedded DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Embedded Data Source"` |
| Published DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Published Data Source"` |
| Custom SQL Table | [Dataset](../../metamodel/entities/dataset.md) | SubTypes `"View"`, `"Custom SQL"` |
@@ -75,14 +90,15 @@ Lineage is emitted as received from Tableau's metadata API for
### Troubleshooting
-### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
+#### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
This may happen when the Tableau API returns NODE_LIMIT_EXCEEDED error in response to metadata query and returns partial results with message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider
- reducing the page size using the `page_size` config param in datahub recipe (Defaults to 10).
- increasing tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to higher value.
-### `PERMISSIONS_MODE_SWITCHED` error in ingestion report
+#### `PERMISSIONS_MODE_SWITCHED` error in ingestion report
+
This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm).
Follow the below steps to enable the derived permissions:
diff --git a/metadata-ingestion/examples/structured_properties/list_structured_properties.py b/metadata-ingestion/examples/structured_properties/list_structured_properties.py
new file mode 100644
index 00000000000000..66ac90c1228a37
--- /dev/null
+++ b/metadata-ingestion/examples/structured_properties/list_structured_properties.py
@@ -0,0 +1,12 @@
+# Usage: python3 list_structured_properties.py
+# Expected Output: List of structured properties
+# This script lists all structured properties in DataHub
+from datahub.api.entities.structuredproperties.structuredproperties import (
+ StructuredProperties,
+)
+from datahub.ingestion.graph.client import get_default_graph
+
+with get_default_graph() as graph:
+ structuredproperties = StructuredProperties.list(graph)
+ for structuredproperty in structuredproperties:
+ print(structuredproperty.dict())
diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py
index 2841985ad07808..0fe79a2c6a8e47 100644
--- a/metadata-ingestion/scripts/avro_codegen.py
+++ b/metadata-ingestion/scripts/avro_codegen.py
@@ -346,7 +346,7 @@ def write_urn_classes(key_aspects: List[dict], urn_dir: Path) -> None:
code = """
# This file contains classes corresponding to entity URNs.
-from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
+from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
import functools
from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -547,10 +547,31 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
assert fields[0]["type"] == ["null", "string"]
fields[0]["type"] = "string"
+ field_urn_type_classes = {}
+ for field in fields:
+ # Figure out if urn types are valid for each field.
+ field_urn_type_class = None
+ if field_name(field) == "platform":
+ field_urn_type_class = "DataPlatformUrn"
+ elif field.get("Urn"):
+ if len(field.get("entityTypes", [])) == 1:
+ field_entity_type = field["entityTypes"][0]
+ field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn"
+ else:
+ field_urn_type_class = "Urn"
+
+ field_urn_type_classes[field_name(field)] = field_urn_type_class
+
_init_arg_parts: List[str] = []
for field in fields:
+ field_urn_type_class = field_urn_type_classes[field_name(field)]
+
default = '"PROD"' if field_name(field) == "env" else None
- _arg_part = f"{field_name(field)}: {field_type(field)}"
+
+ type_hint = field_type(field)
+ if field_urn_type_class:
+ type_hint = f'Union["{field_urn_type_class}", str]'
+ _arg_part = f"{field_name(field)}: {type_hint}"
if default:
_arg_part += f" = {default}"
_init_arg_parts.append(_arg_part)
@@ -579,16 +600,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
init_validation += f'if not {field_name(field)}:\n raise InvalidUrnError("{class_name} {field_name(field)} cannot be empty")\n'
# Generalized mechanism for validating embedded urns.
- field_urn_type_class = None
- if field_name(field) == "platform":
- field_urn_type_class = "DataPlatformUrn"
- elif field.get("Urn"):
- if len(field.get("entityTypes", [])) == 1:
- field_entity_type = field["entityTypes"][0]
- field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn"
- else:
- field_urn_type_class = "Urn"
-
+ field_urn_type_class = field_urn_type_classes[field_name(field)]
if field_urn_type_class:
init_validation += f"{field_name(field)} = str({field_name(field)})\n"
init_validation += (
@@ -608,7 +620,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
init_coercion += " platform_name = DataPlatformUrn.from_string(platform_name).platform_name\n"
if field_name(field) == "platform":
- init_coercion += "platform = DataPlatformUrn(platform).urn()\n"
+ init_coercion += "platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()\n"
elif field_urn_type_class is None:
# For all non-urns, run the value through the UrnEncoder.
init_coercion += (
diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py
index 402cd8a8141990..71eef96f5b9262 100644
--- a/metadata-ingestion/scripts/docgen.py
+++ b/metadata-ingestion/scripts/docgen.py
@@ -1,381 +1,25 @@
+import dataclasses
import glob
-import html
import json
import logging
import os
+import pathlib
import re
import sys
import textwrap
from importlib.metadata import metadata, requires
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Dict, List, Optional
import click
-from pydantic import BaseModel, Field
+from docgen_types import Platform, Plugin
+from docs_config_table import gen_md_table_from_json_schema
from datahub.configuration.common import ConfigModel
-from datahub.ingestion.api.decorators import (
- CapabilitySetting,
- SourceCapability,
- SupportStatus,
-)
+from datahub.ingestion.api.decorators import SourceCapability, SupportStatus
from datahub.ingestion.source.source_registry import source_registry
-from datahub.metadata.schema_classes import SchemaFieldClass
logger = logging.getLogger(__name__)
-DEFAULT_VALUE_MAX_LENGTH = 50
-DEFAULT_VALUE_TRUNCATION_MESSAGE = "..."
-
-
-def _truncate_default_value(value: str) -> str:
- if len(value) > DEFAULT_VALUE_MAX_LENGTH:
- return value[:DEFAULT_VALUE_MAX_LENGTH] + DEFAULT_VALUE_TRUNCATION_MESSAGE
- return value
-
-
-def _format_path_component(path: str) -> str:
- """
- Given a path like 'a.b.c', adds css tags to the components.
- """
- path_components = path.rsplit(".", maxsplit=1)
- if len(path_components) == 1:
- return f'{path_components[0]}'
-
- return (
- f'{path_components[0]}.'
- f'{path_components[1]}'
- )
-
-
-def _format_type_name(type_name: str) -> str:
- return f'{type_name}'
-
-
-def _format_default_line(default_value: str, has_desc_above: bool) -> str:
- default_value = _truncate_default_value(default_value)
- escaped_value = (
- html.escape(default_value)
- # Replace curly braces to avoid JSX issues.
- .replace("{", "{")
- .replace("}", "}")
- # We also need to replace markdown special characters.
- .replace("*", "*")
- .replace("_", "_")
- .replace("[", "[")
- .replace("]", "]")
- .replace("|", "|")
- .replace("`", "`")
- )
- value_elem = f'{escaped_value}'
- return f'Default: {value_elem}
'
-
-
-class FieldRow(BaseModel):
- path: str
- parent: Optional[str]
- type_name: str
- required: bool
- has_default: bool
- default: str
- description: str
- inner_fields: List["FieldRow"] = Field(default_factory=list)
- discriminated_type: Optional[str] = None
-
- class Component(BaseModel):
- type: str
- field_name: Optional[str]
-
- # matches any [...] style section inside a field path
- _V2_FIELD_PATH_TOKEN_MATCHER = r"\[[\w.]*[=]*[\w\(\-\ \_\).]*\][\.]*"
- # matches a .?[...] style section inside a field path anchored to the beginning
- _V2_FIELD_PATH_TOKEN_MATCHER_PREFIX = rf"^[\.]*{_V2_FIELD_PATH_TOKEN_MATCHER}"
- _V2_FIELD_PATH_FIELD_NAME_MATCHER = r"^\w+"
-
- @staticmethod
- def map_field_path_to_components(field_path: str) -> List[Component]:
- m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
- v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)
- components: List[FieldRow.Component] = []
- while m or v:
- token = m.group() if m else v.group() # type: ignore
- if v:
- if components:
- if components[-1].field_name is None:
- components[-1].field_name = token
- else:
- components.append(
- FieldRow.Component(type="non_map_type", field_name=token)
- )
- else:
- components.append(
- FieldRow.Component(type="non_map_type", field_name=token)
- )
-
- if m:
- if token.startswith("[version="):
- pass
- elif "[type=" in token:
- type_match = re.match(r"[\.]*\[type=(.*)\]", token)
- if type_match:
- type_string = type_match.group(1)
- if components and components[-1].type == "map":
- if components[-1].field_name is None:
- pass
- else:
- new_component = FieldRow.Component(
- type="map_key", field_name="`key`"
- )
- components.append(new_component)
- new_component = FieldRow.Component(
- type=type_string, field_name=None
- )
- components.append(new_component)
- if type_string == "map":
- new_component = FieldRow.Component(
- type=type_string, field_name=None
- )
- components.append(new_component)
-
- field_path = field_path[m.span()[1] :] if m else field_path[v.span()[1] :] # type: ignore
- m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
- v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)
-
- return components
-
- @staticmethod
- def field_path_to_components(field_path: str) -> List[str]:
- """
- Inverts the field_path v2 format to get the canonical field path
- [version=2.0].[type=x].foo.[type=string(format=uri)].bar => ["foo","bar"]
- """
- if "type=map" not in field_path:
- return re.sub(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER, "", field_path).split(
- "."
- )
- else:
- # fields with maps in them need special handling to insert the `key` fragment
- return [
- c.field_name
- for c in FieldRow.map_field_path_to_components(field_path)
- if c.field_name
- ]
-
- @classmethod
- def from_schema_field(cls, schema_field: SchemaFieldClass) -> "FieldRow":
- path_components = FieldRow.field_path_to_components(schema_field.fieldPath)
-
- parent = path_components[-2] if len(path_components) >= 2 else None
- if parent == "`key`":
- # the real parent node is one index above
- parent = path_components[-3]
- json_props = (
- json.loads(schema_field.jsonProps) if schema_field.jsonProps else {}
- )
-
- required = json_props.get("required", True)
- has_default = "default" in json_props
- default_value = str(json_props.get("default"))
-
- field_path = ".".join(path_components)
-
- return FieldRow(
- path=field_path,
- parent=parent,
- type_name=str(schema_field.nativeDataType),
- required=required,
- has_default=has_default,
- default=default_value,
- description=schema_field.description,
- inner_fields=[],
- discriminated_type=schema_field.nativeDataType,
- )
-
- def get_checkbox(self) -> str:
- if self.required and not self.has_default:
- # Using a non-breaking space to prevent the checkbox from being
- # broken into a new line.
- if not self.parent: # None and empty string both count
- return ' ✅'
- else:
- return f' ❓'
- else:
- return ""
-
- def to_md_line(self) -> str:
- if self.inner_fields:
- if len(self.inner_fields) == 1:
- type_name = self.inner_fields[0].type_name or self.type_name
- else:
- # To deal with unions that have essentially the same simple field path,
- # we combine the type names into a single string.
- type_name = "One of " + ", ".join(
- [x.type_name for x in self.inner_fields if x.discriminated_type]
- )
- else:
- type_name = self.type_name
-
- description = self.description.strip()
- description = self.description.replace(
- "\n", "
"
- ) # descriptions with newlines in them break markdown rendering
-
- md_line = (
- f'| {_format_path_component(self.path)}'
- f"{self.get_checkbox()}
"
- f' {_format_type_name(type_name)}
'
- f"| {description} "
- f"{_format_default_line(self.default, bool(description)) if self.has_default else ''} |\n"
- )
- return md_line
-
-
-class FieldHeader(FieldRow):
- def to_md_line(self) -> str:
- return "\n".join(
- [
- "| Field | Description |",
- "|:--- |:--- |",
- "",
- ]
- )
-
- def __init__(self):
- pass
-
-
-def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str:
- assert (
- field_prefix or field_name
- ), "One of field_prefix or field_name should be present"
- return (
- f"{field_prefix}.{field_name}" # type: ignore
- if field_prefix and field_name
- else field_name
- if not field_prefix
- else field_prefix
- )
-
-
-def custom_comparator(path: str) -> str:
- """
- Projects a string onto a separate space
- Low_prio string will start with Z else start with A
- Number of field paths will add the second set of letters: 00 - 99
-
- """
- opt1 = path
- prio_value = priority_value(opt1)
- projection = f"{prio_value}"
- projection = f"{projection}{opt1}"
- return projection
-
-
-class FieldTree:
- """
- A helper class that re-constructs the tree hierarchy of schema fields
- to help sort fields by importance while keeping nesting intact
- """
-
- def __init__(self, field: Optional[FieldRow] = None):
- self.field = field
- self.fields: Dict[str, "FieldTree"] = {}
-
- def add_field(self, row: FieldRow, path: Optional[str] = None) -> "FieldTree":
- # logger.warn(f"Add field: path:{path}, row:{row}")
- if self.field and self.field.path == row.path:
- # we have an incoming field with the same path as us, this is probably a union variant
- # attach to existing field
- self.field.inner_fields.append(row)
- else:
- path = path if path is not None else row.path
- top_level_field = path.split(".")[0]
- if top_level_field in self.fields:
- self.fields[top_level_field].add_field(
- row, ".".join(path.split(".")[1:])
- )
- else:
- self.fields[top_level_field] = FieldTree(field=row)
- # logger.warn(f"{self}")
- return self
-
- def sort(self):
- # Required fields before optionals
- required_fields = {
- k: v for k, v in self.fields.items() if v.field and v.field.required
- }
- optional_fields = {
- k: v for k, v in self.fields.items() if v.field and not v.field.required
- }
-
- self.sorted_fields = []
- for field_map in [required_fields, optional_fields]:
- # Top-level fields before fields with nesting
- self.sorted_fields.extend(
- sorted(
- [f for f, val in field_map.items() if val.fields == {}],
- key=custom_comparator,
- )
- )
- self.sorted_fields.extend(
- sorted(
- [f for f, val in field_map.items() if val.fields != {}],
- key=custom_comparator,
- )
- )
-
- for field_tree in self.fields.values():
- field_tree.sort()
-
- def get_fields(self) -> Iterable[FieldRow]:
- if self.field:
- yield self.field
- for key in self.sorted_fields:
- yield from self.fields[key].get_fields()
-
- def __repr__(self) -> str:
- result = {}
- if self.field:
- result["_self"] = json.loads(json.dumps(self.field.dict()))
- for f in self.fields:
- result[f] = json.loads(str(self.fields[f]))
- return json.dumps(result, indent=2)
-
-
-def priority_value(path: str) -> str:
- # A map of low value tokens to their relative importance
- low_value_token_map = {"env": "X", "profiling": "Y", "stateful_ingestion": "Z"}
- tokens = path.split(".")
- for low_value_token in low_value_token_map:
- if low_value_token in tokens:
- return low_value_token_map[low_value_token]
-
- # everything else high-prio
- return "A"
-
-
-def gen_md_table_from_struct(schema_dict: Dict[str, Any]) -> List[str]:
- from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator
-
- # we don't want default field values to be injected into the description of the field
- JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
- schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
- result: List[str] = [FieldHeader().to_md_line()]
-
- field_tree = FieldTree(field=None)
- for field in schema_fields:
- row: FieldRow = FieldRow.from_schema_field(field)
- field_tree.add_field(row)
-
- field_tree.sort()
-
- for row in field_tree.get_fields():
- result.append(row.to_md_line())
-
- # Wrap with a .config-table div.
- result = ["\n<div className='config-table'>\n\n", *result, "\n</div>\n"]
-
- return result
-
def get_snippet(long_string: str, max_length: int = 100) -> str:
snippet = ""
@@ -424,19 +68,6 @@ def get_capability_text(src_capability: SourceCapability) -> str:
)
-def create_or_update(
- something: Dict[Any, Any], path: List[str], value: Any
-) -> Dict[Any, Any]:
- dict_under_operation = something
- for p in path[:-1]:
- if p not in dict_under_operation:
- dict_under_operation[p] = {}
- dict_under_operation = dict_under_operation[p]
-
- dict_under_operation[path[-1]] = value
- return something
-
-
def does_extra_exist(extra_name: str) -> bool:
for key, value in metadata("acryl-datahub").items():
if key == "Provides-Extra" and value == extra_name:
@@ -498,6 +129,102 @@ def new_url(original_url: str, file_path: str) -> str:
return new_content
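+# NOTE: load_plugin relies on the classmethods that the source decorators
+# (@platform_name, @config_class, @support_status, @capability) attach to a
+# source class, and probes them via hasattr() so that a source missing a
+# decorator still yields a partially populated Plugin.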
+def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
+ logger.debug(f"Loading {plugin_name}")
+ class_or_exception = source_registry._ensure_not_lazy(plugin_name)
+ if isinstance(class_or_exception, Exception):
+ raise class_or_exception
+ source_type = source_registry.get(plugin_name)
+ logger.debug(f"Source class is {source_type}")
+
+ if hasattr(source_type, "get_platform_name"):
+ platform_name = source_type.get_platform_name()
+ else:
+ platform_name = (
+ plugin_name.title()
+ ) # we like platform names to be human readable
+
+ platform_id = None
+ if hasattr(source_type, "get_platform_id"):
+ platform_id = source_type.get_platform_id()
+ if platform_id is None:
+ raise ValueError(f"Platform ID not found for {plugin_name}")
+
+ plugin = Plugin(
+ name=plugin_name,
+ platform_id=platform_id,
+ platform_name=platform_name,
+ classname=".".join([source_type.__module__, source_type.__name__]),
+ )
+
+ if hasattr(source_type, "get_platform_doc_order"):
+ platform_doc_order = source_type.get_platform_doc_order()
+ plugin.doc_order = platform_doc_order
+
+ plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
+ if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
+ plugin_file_name = plugin_file_name + "/__init__.py"
+ else:
+ plugin_file_name = plugin_file_name + ".py"
+ if os.path.exists(plugin_file_name):
+ plugin.filename = plugin_file_name
+ else:
+ logger.info(
+ f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
+ )
+
+ if hasattr(source_type, "__doc__"):
+ plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")
+
+ if hasattr(source_type, "get_support_status"):
+ plugin.support_status = source_type.get_support_status()
+
+ if hasattr(source_type, "get_capabilities"):
+ capabilities = list(source_type.get_capabilities())
+ capabilities.sort(key=lambda x: x.capability.value)
+ plugin.capabilities = capabilities
+
+ try:
+ extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
+ plugin.extra_deps = (
+ get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
+ )
+ except Exception as e:
+ logger.info(
+ f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
+ )
+
+ if hasattr(source_type, "get_config_class"):
+ source_config_class: ConfigModel = source_type.get_config_class()
+
+ plugin.config_json_schema = source_config_class.schema_json(indent=2)
+ plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema())
+
+ # Write the config json schema to the out_dir.
+ config_dir = pathlib.Path(out_dir) / "config_schemas"
+ config_dir.mkdir(parents=True, exist_ok=True)
+ (config_dir / f"{plugin_name}_config.json").write_text(
+ plugin.config_json_schema
+ )
+
+ return plugin
+
+
+@dataclasses.dataclass
+class PluginMetrics:
+ discovered: int = 0
+ loaded: int = 0
+ generated: int = 0
+ failed: int = 0
+
+
+@dataclasses.dataclass
+class PlatformMetrics:
+ discovered: int = 0
+ generated: int = 0
+ warnings: List[str] = dataclasses.field(default_factory=list)
+
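+# Both metric dataclasses are dumped as JSON at the end of generate() via
+# dataclasses.asdict().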
+
@click.command()
@click.option("--out-dir", type=str, required=True)
@click.option("--extra-docs", type=str, required=False)
@@ -505,239 +232,111 @@ def new_url(original_url: str, file_path: str) -> str:
def generate(
out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
) -> None: # noqa: C901
- source_documentation: Dict[str, Any] = {}
- metrics = {}
- metrics["source_platforms"] = {"discovered": 0, "generated": 0, "warnings": []}
- metrics["plugins"] = {"discovered": 0, "generated": 0, "failed": 0}
-
- if extra_docs:
- for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True):
- m = re.search("/docs/sources/(.*)/(.*).md", path)
- if m:
- platform_name = m.group(1).lower()
- file_name = m.group(2)
- destination_md: str = (
- f"../docs/generated/ingestion/sources/{platform_name}.md"
- )
-
- with open(path, "r") as doc_file:
- file_contents = doc_file.read()
- final_markdown = rewrite_markdown(
- file_contents, path, destination_md
- )
-
- if file_name == "README":
- # README goes as platform level docs
- # all other docs are assumed to be plugin level
- create_or_update(
- source_documentation,
- [platform_name, "custom_docs"],
- final_markdown,
- )
- else:
- if "_" in file_name:
- plugin_doc_parts = file_name.split("_")
- if len(plugin_doc_parts) != 2 or plugin_doc_parts[
- 1
- ] not in ["pre", "post"]:
- raise Exception(
- f"{file_name} needs to be of the form _pre.md or _post.md"
- )
-
- docs_key_name = f"custom_docs_{plugin_doc_parts[1]}"
- create_or_update(
- source_documentation,
- [
- platform_name,
- "plugins",
- plugin_doc_parts[0],
- docs_key_name,
- ],
- final_markdown,
- )
- else:
- create_or_update(
- source_documentation,
- [
- platform_name,
- "plugins",
- file_name,
- "custom_docs_post",
- ],
- final_markdown,
- )
- else:
- yml_match = re.search("/docs/sources/(.*)/(.*)_recipe.yml", path)
- if yml_match:
- platform_name = yml_match.group(1).lower()
- plugin_name = yml_match.group(2)
- with open(path, "r") as doc_file:
- file_contents = doc_file.read()
- create_or_update(
- source_documentation,
- [platform_name, "plugins", plugin_name, "recipe"],
- file_contents,
- )
+ plugin_metrics = PluginMetrics()
+ platform_metrics = PlatformMetrics()
+ platforms: Dict[str, Platform] = {}
for plugin_name in sorted(source_registry.mapping.keys()):
if source and source != plugin_name:
continue
if plugin_name in {
"snowflake-summary",
+ "snowflake-queries",
+ "bigquery-queries",
}:
logger.info(f"Skipping {plugin_name} as it is on the deny list")
continue
- metrics["plugins"]["discovered"] = metrics["plugins"]["discovered"] + 1 # type: ignore
- # We want to attempt to load all plugins before printing a summary.
- source_type = None
+ plugin_metrics.discovered += 1
try:
- # output = subprocess.check_output(
- # ["/bin/bash", "-c", f"pip install -e '.[{key}]'"]
- # )
- class_or_exception = source_registry._ensure_not_lazy(plugin_name)
- if isinstance(class_or_exception, Exception):
- raise class_or_exception
- logger.debug(f"Processing {plugin_name}")
- source_type = source_registry.get(plugin_name)
- logger.debug(f"Source class is {source_type}")
- extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
- extra_deps = (
- get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
- )
+ plugin = load_plugin(plugin_name, out_dir=out_dir)
except Exception as e:
- logger.warning(
- f"Failed to process {plugin_name} due to exception {e}", exc_info=e
+ logger.error(
+ f"Failed to load {plugin_name} due to exception {e}", exc_info=e
)
- metrics["plugins"]["failed"] = metrics["plugins"].get("failed", 0) + 1 # type: ignore
-
- if source_type and hasattr(source_type, "get_config_class"):
- try:
- source_config_class: ConfigModel = source_type.get_config_class()
- support_status = SupportStatus.UNKNOWN
- capabilities = []
- if hasattr(source_type, "__doc__"):
- source_doc = textwrap.dedent(source_type.__doc__ or "")
- if hasattr(source_type, "get_platform_name"):
- platform_name = source_type.get_platform_name()
- else:
- platform_name = (
- plugin_name.title()
- ) # we like platform names to be human readable
-
- if hasattr(source_type, "get_platform_id"):
- platform_id = source_type.get_platform_id()
-
- if hasattr(source_type, "get_platform_doc_order"):
- platform_doc_order = source_type.get_platform_doc_order()
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "doc_order"],
- platform_doc_order,
- )
-
- source_documentation[platform_id] = (
- source_documentation.get(platform_id) or {}
- )
-
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "classname"],
- ".".join([source_type.__module__, source_type.__name__]),
- )
- plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
- if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
- plugin_file_name = plugin_file_name + "/__init__.py"
- else:
- plugin_file_name = plugin_file_name + ".py"
- if os.path.exists(plugin_file_name):
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "filename"],
- plugin_file_name,
- )
- else:
- logger.info(
- f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}"
- )
-
- if hasattr(source_type, "get_support_status"):
- support_status = source_type.get_support_status()
-
- if hasattr(source_type, "get_capabilities"):
- capabilities = list(source_type.get_capabilities())
- capabilities.sort(key=lambda x: x.capability.value)
-
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "capabilities"],
- capabilities,
- )
-
- create_or_update(
- source_documentation, [platform_id, "name"], platform_name
- )
-
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "extra_deps"],
- extra_deps,
- )
+ plugin_metrics.failed += 1
+ continue
+ else:
+ plugin_metrics.loaded += 1
- config_dir = f"{out_dir}/config_schemas"
- os.makedirs(config_dir, exist_ok=True)
- with open(f"{config_dir}/{plugin_name}_config.json", "w") as f:
- f.write(source_config_class.schema_json(indent=2))
+ # Add to the platform list if not already present.
+ platforms.setdefault(
+ plugin.platform_id,
+ Platform(
+ id=plugin.platform_id,
+ name=plugin.platform_name,
+ ),
+ ).add_plugin(plugin_name=plugin.name, plugin=plugin)
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "config_schema"],
- source_config_class.schema_json(indent=2) or "",
+ if extra_docs:
+ for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True):
+ if m := re.search("/docs/sources/(.*)/(.*).md", path):
+ platform_name = m.group(1).lower() # TODO: rename this to platform_id
+ file_name = m.group(2)
+ destination_md: str = (
+ f"../docs/generated/ingestion/sources/{platform_name}.md"
)
- table_md = gen_md_table_from_struct(source_config_class.schema())
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "source_doc"],
- source_doc or "",
- )
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "config"],
- table_md,
- )
- create_or_update(
- source_documentation,
- [platform_id, "plugins", plugin_name, "support_status"],
- support_status,
- )
+ with open(path, "r") as doc_file:
+ file_contents = doc_file.read()
+ final_markdown = rewrite_markdown(file_contents, path, destination_md)
+
+ if file_name == "README":
+ # README goes as platform level docs
+ # all other docs are assumed to be plugin level
+ platforms[platform_name].custom_docs_pre = final_markdown
+
+ elif "_" in file_name:
+ plugin_doc_parts = file_name.split("_")
+ if len(plugin_doc_parts) != 2:
+ raise ValueError(
+ f"{file_name} needs to be of the form _pre.md or _post.md"
+ )
+ plugin_name, suffix = plugin_doc_parts
+ if suffix == "pre":
+ platforms[platform_name].plugins[
+ plugin_name
+ ].custom_docs_pre = final_markdown
+ elif suffix == "post":
+ platforms[platform_name].plugins[
+ plugin_name
+ ].custom_docs_post = final_markdown
+ else:
+ raise ValueError(
+ f"{file_name} needs to be of the form _pre.md or _post.md"
+ )
- except Exception as e:
- raise e
+ else: # assume this is the platform post.
+ # TODO: Probably need better error checking here.
+ platforms[platform_name].plugins[
+ file_name
+ ].custom_docs_post = final_markdown
+ elif yml_match := re.search("/docs/sources/(.*)/(.*)_recipe.yml", path):
+ platform_name = yml_match.group(1).lower()
+ plugin_name = yml_match.group(2)
+ platforms[platform_name].plugins[
+ plugin_name
+ ].starter_recipe = pathlib.Path(path).read_text()
sources_dir = f"{out_dir}/sources"
os.makedirs(sources_dir, exist_ok=True)
+ # Sort platforms by platform name.
+ platforms = dict(sorted(platforms.items(), key=lambda x: x[1].name.casefold()))
+
i = 0
- for platform_id, platform_docs in sorted(
- source_documentation.items(),
- key=lambda x: (x[1]["name"].casefold(), x[1]["name"])
- if "name" in x[1]
- else (x[0].casefold(), x[0]),
- ):
+ for platform_id, platform in platforms.items():
if source and platform_id != source:
continue
- metrics["source_platforms"]["discovered"] = (
- metrics["source_platforms"]["discovered"] + 1 # type: ignore
- )
+ platform_metrics.discovered += 1
platform_doc_file = f"{sources_dir}/{platform_id}.md"
- if "name" not in platform_docs:
- # We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes
- warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?"
- logger.error(warning_msg)
- metrics["source_platforms"]["warnings"].append(warning_msg) # type: ignore
- continue
+ # if "name" not in platform_docs:
+ # # We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes
+ # warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?"
+ # logger.error(warning_msg)
+ # metrics["source_platforms"]["warnings"].append(warning_msg) # type: ignore
+ # continue
with open(platform_doc_file, "w") as f:
i += 1
@@ -745,12 +344,12 @@ def generate(
f.write(
"import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n"
)
- f.write(f"# {platform_docs['name']}\n")
+ f.write(f"# {platform.name}\n")
- if len(platform_docs["plugins"].keys()) > 1:
+ if len(platform.plugins) > 1:
# More than one plugin used to provide integration with this platform
f.write(
- f"There are {len(platform_docs['plugins'].keys())} sources that provide integration with {platform_docs['name']}\n"
+ f"There are {len(platform.plugins)} sources that provide integration with {platform.name}\n"
)
f.write("\n")
f.write("\n")
@@ -759,18 +358,22 @@ def generate(
f.write(f"{col_header} | ")
f.write("")
+ # Sort plugins in the platform.
+ # It's a dict, so we need to recreate it.
+ platform.plugins = dict(
+ sorted(
+ platform.plugins.items(),
+ key=lambda x: str(x[1].doc_order) if x[1].doc_order else x[0],
+ )
+ )
+
# f.write("| Source Module | Documentation |\n")
# f.write("| ------ | ---- |\n")
- for plugin, plugin_docs in sorted(
- platform_docs["plugins"].items(),
- key=lambda x: str(x[1].get("doc_order"))
- if x[1].get("doc_order")
- else x[0],
- ):
+ for plugin_name, plugin in platform.plugins.items():
 f.write("<tr>\n")
- f.write(f"<td>\n\n`{plugin}`\n\n</td>\n")
+ f.write(f"<td>\n\n`{plugin_name}`\n\n</td>\n")
 f.write(
- f"<td>\n\n\n{platform_docs['plugins'][plugin].get('source_doc') or ''} [Read more...](#module-{plugin})\n\n\n</td>\n"
+ f"<td>\n\n\n{plugin.source_docstring or ''} [Read more...](#module-{plugin_name})\n\n\n</td>\n"
 )
 f.write("</tr>\n")
# f.write(
@@ -778,43 +381,33 @@ def generate(
# )
f.write("
\n\n")
# insert platform level custom docs before plugin section
- f.write(platform_docs.get("custom_docs") or "")
+ f.write(platform.custom_docs_pre or "")
# all_plugins = platform_docs["plugins"].keys()
- for plugin, plugin_docs in sorted(
- platform_docs["plugins"].items(),
- key=lambda x: str(x[1].get("doc_order"))
- if x[1].get("doc_order")
- else x[0],
- ):
- if len(platform_docs["plugins"].keys()) > 1:
+ for plugin_name, plugin in platform.plugins.items():
+ if len(platform.plugins) > 1:
# We only need to show this if there are multiple modules.
- f.write(f"\n\n## Module `{plugin}`\n")
+ f.write(f"\n\n## Module `{plugin_name}`\n")
- if "support_status" in plugin_docs:
- f.write(
- get_support_status_badge(plugin_docs["support_status"]) + "\n\n"
- )
- if "capabilities" in plugin_docs and len(plugin_docs["capabilities"]):
+ if plugin.support_status != SupportStatus.UNKNOWN:
+ f.write(get_support_status_badge(plugin.support_status) + "\n\n")
+ if plugin.capabilities and len(plugin.capabilities):
f.write("\n### Important Capabilities\n")
f.write("| Capability | Status | Notes |\n")
f.write("| ---------- | ------ | ----- |\n")
- plugin_capabilities: List[CapabilitySetting] = plugin_docs[
- "capabilities"
- ]
- for cap_setting in plugin_capabilities:
+ for cap_setting in plugin.capabilities:
f.write(
f"| {get_capability_text(cap_setting.capability)} | {get_capability_supported_badge(cap_setting.supported)} | {cap_setting.description} |\n"
)
f.write("\n")
- f.write(f"{plugin_docs.get('source_doc') or ''}\n")
+ f.write(f"{plugin.source_docstring or ''}\n")
# Insert custom pre section
- f.write(plugin_docs.get("custom_docs_pre", ""))
+ f.write(plugin.custom_docs_pre or "")
f.write("\n### CLI based Ingestion\n")
- if "extra_deps" in plugin_docs:
+ if plugin.extra_deps and len(plugin.extra_deps):
f.write("\n#### Install the Plugin\n")
- if plugin_docs["extra_deps"] != []:
+ if plugin.extra_deps != []:
f.write("```shell\n")
f.write(f"pip install 'acryl-datahub[{plugin}]'\n")
f.write("```\n")
@@ -822,7 +415,7 @@ def generate(
f.write(
f"The `{plugin}` source works out of the box with `acryl-datahub`.\n"
)
- if "recipe" in plugin_docs:
+ if plugin.starter_recipe:
f.write("\n### Starter Recipe\n")
f.write(
"Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.\n\n\n"
@@ -831,9 +424,10 @@ def generate(
"For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes).\n"
)
f.write("```yaml\n")
- f.write(plugin_docs["recipe"])
+ f.write(plugin.starter_recipe)
f.write("\n```\n")
- if "config" in plugin_docs:
+ if plugin.config_json_schema:
+ assert plugin.config_md is not None
f.write("\n### Config Details\n")
f.write(
"""
@@ -845,8 +439,8 @@ def generate(
# f.write(
# "\n\nView All Configuration Options
\n\n"
# )
- for doc in plugin_docs["config"]:
- f.write(doc)
+ f.write(plugin.config_md)
+ f.write("\n\n")
# f.write("\n \n\n")
f.write(
f"""
@@ -854,39 +448,49 @@ def generate(
The [JSONSchema](https://json-schema.org/) for this configuration is inlined below.\n\n
```javascript
-{plugin_docs['config_schema']}
+{plugin.config_json_schema}
```\n\n
\n\n"""
)
+
# insert custom plugin docs after config details
- f.write(plugin_docs.get("custom_docs_post", ""))
- if "classname" in plugin_docs:
+ f.write(plugin.custom_docs_post or "")
+ if plugin.classname:
f.write("\n### Code Coordinates\n")
- f.write(f"- Class Name: `{plugin_docs['classname']}`\n")
- if "filename" in plugin_docs:
+ f.write(f"- Class Name: `{plugin.classname}`\n")
+ if plugin.filename:
f.write(
- f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin_docs['filename']})\n\n"
+ f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin.filename})\n\n"
)
- metrics["plugins"]["generated"] = metrics["plugins"]["generated"] + 1 # type: ignore
+ plugin_metrics.generated += 1
# Using an h2 tag to prevent this from showing up in page's TOC sidebar.
f.write("\nQuestions
\n\n")
f.write(
- f"If you've got any questions on configuring ingestion for {platform_docs.get('name',platform_id)}, feel free to ping us on [our Slack](https://slack.datahubproject.io).\n"
- )
- metrics["source_platforms"]["generated"] = (
- metrics["source_platforms"]["generated"] + 1 # type: ignore
+ f"If you've got any questions on configuring ingestion for {platform.name}, feel free to ping us on [our Slack](https://slack.datahubproject.io).\n"
)
+ platform_metrics.generated += 1
print("Ingestion Documentation Generation Complete")
print("############################################")
- print(json.dumps(metrics, indent=2))
+ print(
+ json.dumps(
+ {
+ "plugin_metrics": dataclasses.asdict(plugin_metrics),
+ "platform_metrics": dataclasses.asdict(platform_metrics),
+ },
+ indent=2,
+ )
+ )
print("############################################")
- if metrics["plugins"].get("failed", 0) > 0: # type: ignore
+ if plugin_metrics.failed > 0:
sys.exit(1)
- ### Create Lineage doc
+ # Create Lineage doc
+ generate_lineage_doc(platforms)
+
+def generate_lineage_doc(platforms: Dict[str, Platform]) -> None:
source_dir = "../docs/generated/lineage"
os.makedirs(source_dir, exist_ok=True)
doc_file = f"{source_dir}/lineage-feature-guide.md"
@@ -894,7 +498,7 @@ def generate(
f.write(
"import FeatureAvailability from '@site/src/components/FeatureAvailability';\n\n"
)
- f.write(f"# About DataHub Lineage\n\n")
+ f.write("# About DataHub Lineage\n\n")
f.write("\n")
f.write(
@@ -996,30 +600,24 @@ def generate(
)
f.write("| ---------- | ------ | ----- |----- |\n")
- for platform_id, platform_docs in sorted(
- source_documentation.items(),
- key=lambda x: (x[1]["name"].casefold(), x[1]["name"])
- if "name" in x[1]
- else (x[0].casefold(), x[0]),
- ):
- for plugin, plugin_docs in sorted(
- platform_docs["plugins"].items(),
- key=lambda x: str(x[1].get("doc_order"))
- if x[1].get("doc_order")
- else x[0],
+ for platform_id, platform in platforms.items():
+ for plugin in sorted(
+ platform.plugins.values(),
+ key=lambda x: str(x.doc_order) if x.doc_order else x.name,
):
- platform_name = platform_docs["name"]
- if len(platform_docs["plugins"].keys()) > 1:
+ if len(platform.plugins) > 1:
# We only need to show this if there are multiple modules.
- platform_name = f"{platform_name} `{plugin}`"
+ platform_plugin_name = f"{platform.name} `{plugin.name}`"
+ else:
+ platform_plugin_name = platform.name
# Initialize variables
table_level_supported = "❌"
column_level_supported = "❌"
config_names = ""
- if "capabilities" in plugin_docs:
- plugin_capabilities = plugin_docs["capabilities"]
+ if plugin.capabilities and len(plugin.capabilities):
+ plugin_capabilities = plugin.capabilities
for cap_setting in plugin_capabilities:
capability_text = get_capability_text(cap_setting.capability)
@@ -1040,10 +638,10 @@ def generate(
column_level_supported = "✅"
if not (table_level_supported == "❌" and column_level_supported == "❌"):
- if "config_schema" in plugin_docs:
- config_properties = json.loads(
- plugin_docs["config_schema"]
- ).get("properties", {})
+ if plugin.config_json_schema:
+ config_properties = json.loads(plugin.config_json_schema).get(
+ "properties", {}
+ )
+ config_names = "<br />".join(
[
f"- {property_name}"
@@ -1065,7 +663,7 @@ def generate(
]
if platform_id not in lineage_not_applicable_sources:
f.write(
- f"| [{platform_name}](../../generated/ingestion/sources/{platform_id}.md) | {table_level_supported} | {column_level_supported} | {config_names}|\n"
+ f"| [{platform_plugin_name}](../../generated/ingestion/sources/{platform_id}.md) | {table_level_supported} | {column_level_supported} | {config_names}|\n"
)
f.write(
diff --git a/metadata-ingestion/scripts/docgen_types.py b/metadata-ingestion/scripts/docgen_types.py
new file mode 100644
index 00000000000000..c96ab955e8cce2
--- /dev/null
+++ b/metadata-ingestion/scripts/docgen_types.py
@@ -0,0 +1,45 @@
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+from datahub.ingestion.api.decorators import CapabilitySetting, SupportStatus
+
+
+@dataclass
+class Plugin:
+ # Required fields
+ name: str
+ platform_id: str
+ platform_name: str
+ classname: str
+
+ # Optional documentation fields
+ source_docstring: Optional[str] = None
+ config_json_schema: Optional[str] = None
+ config_md: Optional[str] = None
+ custom_docs_pre: Optional[str] = None
+ custom_docs_post: Optional[str] = None
+ starter_recipe: Optional[str] = None
+
+ # Optional metadata fields
+ support_status: SupportStatus = SupportStatus.UNKNOWN
+ filename: Optional[str] = None
+ doc_order: Optional[int] = None
+
+ # Lists with empty defaults
+ capabilities: List[CapabilitySetting] = field(default_factory=list)
+ extra_deps: List[str] = field(default_factory=list)
+
+
+@dataclass
+class Platform:
+ # Required fields
+ id: str
+ name: str
+
+ # Optional fields
+ custom_docs_pre: Optional[str] = None
+ plugins: Dict[str, Plugin] = field(default_factory=dict)
+
+ def add_plugin(self, plugin_name: str, plugin: Plugin) -> None:
+ """Helper method to add a plugin to the platform"""
+ self.plugins[plugin_name] = plugin
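+
+
+# Illustrative only: docgen.py assembles these incrementally, roughly equivalent to
+#   platform = Platform(id="snowflake", name="Snowflake")
+#   platform.add_plugin(plugin.name, plugin)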
diff --git a/metadata-ingestion/scripts/docs_config_table.py b/metadata-ingestion/scripts/docs_config_table.py
new file mode 100644
index 00000000000000..3c5d9d0b0a2ba5
--- /dev/null
+++ b/metadata-ingestion/scripts/docs_config_table.py
@@ -0,0 +1,376 @@
+import html
+import json
+import re
+from typing import Any, Dict, Iterable, List, Optional, Type
+
+from pydantic import BaseModel, Field
+
+from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator
+from datahub.metadata.schema_classes import SchemaFieldClass
+
+DEFAULT_VALUE_MAX_LENGTH = 50
+DEFAULT_VALUE_TRUNCATION_MESSAGE = "..."
+
+
+def _truncate_default_value(value: str) -> str:
+ if len(value) > DEFAULT_VALUE_MAX_LENGTH:
+ return value[:DEFAULT_VALUE_MAX_LENGTH] + DEFAULT_VALUE_TRUNCATION_MESSAGE
+ return value
+
+
+def _format_path_component(path: str) -> str:
+ """
+ Given a path like 'a.b.c', adds css tags to the components.
+ """
+ path_components = path.rsplit(".", maxsplit=1)
+ if len(path_components) == 1:
+ return f'<span className="path-main">{path_components[0]}</span>'
+
+ return (
+ f'<span className="path-prefix">{path_components[0]}.</span>'
+ f'<span className="path-main">{path_components[1]}</span>'
+ )
+
+
+def _format_type_name(type_name: str) -> str:
+ return f'<span className="type-name">{type_name}</span>'
+
+
+def _format_default_line(default_value: str, has_desc_above: bool) -> str:
+ default_value = _truncate_default_value(default_value)
+ escaped_value = (
+ html.escape(default_value)
+ # Replace curly braces to avoid JSX issues.
+ .replace("{", "{")
+ .replace("}", "}")
+ # We also need to replace markdown special characters.
+ .replace("*", "*")
+ .replace("_", "_")
+ .replace("[", "[")
+ .replace("]", "]")
+ .replace("|", "|")
+ .replace("`", "`")
+ )
+ value_elem = f'{escaped_value}'
+ return f'Default: {value_elem}
'
+
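+# e.g. a default of "{}" is emitted as "&#123;&#125;" and "[]" as "&#91;&#93;",
+# so MDX does not evaluate the braces as JSX and markdown does not parse the
+# brackets as link syntax.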
+
+class FieldRow(BaseModel):
+ path: str
+ parent: Optional[str]
+ type_name: str
+ required: bool
+ has_default: bool
+ default: str
+ description: str
+ inner_fields: List["FieldRow"] = Field(default_factory=list)
+ discriminated_type: Optional[str] = None
+
+ class Component(BaseModel):
+ type: str
+ field_name: Optional[str]
+
+ # matches any [...] style section inside a field path
+ _V2_FIELD_PATH_TOKEN_MATCHER = r"\[[\w.]*[=]*[\w\(\-\ \_\).]*\][\.]*"
+ # matches a .?[...] style section inside a field path anchored to the beginning
+ _V2_FIELD_PATH_TOKEN_MATCHER_PREFIX = rf"^[\.]*{_V2_FIELD_PATH_TOKEN_MATCHER}"
+ _V2_FIELD_PATH_FIELD_NAME_MATCHER = r"^\w+"
+
+ @staticmethod
+ def map_field_path_to_components(field_path: str) -> List[Component]:
+ m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
+ v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)
+ components: List[FieldRow.Component] = []
+ while m or v:
+ token = m.group() if m else v.group() # type: ignore
+ if v:
+ if components:
+ if components[-1].field_name is None:
+ components[-1].field_name = token
+ else:
+ components.append(
+ FieldRow.Component(type="non_map_type", field_name=token)
+ )
+ else:
+ components.append(
+ FieldRow.Component(type="non_map_type", field_name=token)
+ )
+
+ if m:
+ if token.startswith("[version="):
+ pass
+ elif "[type=" in token:
+ type_match = re.match(r"[\.]*\[type=(.*)\]", token)
+ if type_match:
+ type_string = type_match.group(1)
+ if components and components[-1].type == "map":
+ if components[-1].field_name is None:
+ pass
+ else:
+ new_component = FieldRow.Component(
+ type="map_key", field_name="`key`"
+ )
+ components.append(new_component)
+ new_component = FieldRow.Component(
+ type=type_string, field_name=None
+ )
+ components.append(new_component)
+ if type_string == "map":
+ new_component = FieldRow.Component(
+ type=type_string, field_name=None
+ )
+ components.append(new_component)
+
+ field_path = field_path[m.span()[1] :] if m else field_path[v.span()[1] :] # type: ignore
+ m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
+ v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)
+
+ return components
+
+ @staticmethod
+ def field_path_to_components(field_path: str) -> List[str]:
+ """
+ Inverts the field_path v2 format to get the canonical field path
+ [version=2.0].[type=x].foo.[type=string(format=uri)].bar => ["foo","bar"]
+ """
+ if "type=map" not in field_path:
+ return re.sub(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER, "", field_path).split(
+ "."
+ )
+ else:
+ # fields with maps in them need special handling to insert the `key` fragment
+ return [
+ c.field_name
+ for c in FieldRow.map_field_path_to_components(field_path)
+ if c.field_name
+ ]
+
+ @classmethod
+ def from_schema_field(cls, schema_field: SchemaFieldClass) -> "FieldRow":
+ path_components = FieldRow.field_path_to_components(schema_field.fieldPath)
+
+ parent = path_components[-2] if len(path_components) >= 2 else None
+ if parent == "`key`":
+ # the real parent node is one index above
+ parent = path_components[-3]
+ json_props = (
+ json.loads(schema_field.jsonProps) if schema_field.jsonProps else {}
+ )
+
+ required = json_props.get("required", True)
+ has_default = "default" in json_props
+ default_value = str(json_props.get("default"))
+
+ field_path = ".".join(path_components)
+
+ return FieldRow(
+ path=field_path,
+ parent=parent,
+ type_name=str(schema_field.nativeDataType),
+ required=required,
+ has_default=has_default,
+ default=default_value,
+ description=schema_field.description,
+ inner_fields=[],
+ discriminated_type=schema_field.nativeDataType,
+ )
+
+ def get_checkbox(self) -> str:
+ if self.required and not self.has_default:
+ # Using a non-breaking space to prevent the checkbox from being
+ # broken into a new line.
+ if not self.parent: # None and empty string both count
+ return '&nbsp;<abbr title="Required">✅</abbr>'
+ else:
+ return f'&nbsp;<abbr title="Required if {self.parent} is set">❓</abbr>'
+ else:
+ return ""
+
+ def to_md_line(self) -> str:
+ if self.inner_fields:
+ if len(self.inner_fields) == 1:
+ type_name = self.inner_fields[0].type_name or self.type_name
+ else:
+ # To deal with unions that have essentially the same simple field path,
+ # we combine the type names into a single string.
+ type_name = "One of " + ", ".join(
+ [x.type_name for x in self.inner_fields if x.discriminated_type]
+ )
+ else:
+ type_name = self.type_name
+
+ description = self.description.strip()
+ description = description.replace(
+ "\n", "<br/>"
+ ) # descriptions with newlines in them break markdown rendering
+
+ md_line = (
+ f'| <div className="path-line">{_format_path_component(self.path)}'
+ f"{self.get_checkbox()}</div>"
+ f' <div className="type-name-line">{_format_type_name(type_name)}</div> '
+ f"| {description} "
+ f"{_format_default_line(self.default, bool(description)) if self.has_default else ''} |\n"
+ )
+ return md_line
+
+
+class FieldHeader(FieldRow):
+ def to_md_line(self) -> str:
+ return "\n".join(
+ [
+ "| Field | Description |",
+ "|:--- |:--- |",
+ "",
+ ]
+ )
+
+ def __init__(self):
+ pass
+
+
+def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str:
+ assert (
+ field_prefix or field_name
+ ), "One of field_prefix or field_name should be present"
+ return (
+ f"{field_prefix}.{field_name}" # type: ignore
+ if field_prefix and field_name
+ else field_name
+ if not field_prefix
+ else field_prefix
+ )
+
+
+def custom_comparator(path: str) -> str:
+ """
+ Projects a string onto a separate space
+ Low_prio string will start with Z else start with A
+ Number of field paths will add the second set of letters: 00 - 99
+
+ """
+ opt1 = path
+ prio_value = priority_value(opt1)
+ projection = f"{prio_value}"
+ projection = f"{projection}{opt1}"
+ return projection
+
+
+class FieldTree:
+ """
+ A helper class that re-constructs the tree hierarchy of schema fields
+ to help sort fields by importance while keeping nesting intact
+ """
+
+ def __init__(self, field: Optional[FieldRow] = None):
+ self.field = field
+ self.fields: Dict[str, "FieldTree"] = {}
+
+ def add_field(self, row: FieldRow, path: Optional[str] = None) -> "FieldTree":
+ # logger.warn(f"Add field: path:{path}, row:{row}")
+ if self.field and self.field.path == row.path:
+ # we have an incoming field with the same path as us, this is probably a union variant
+ # attach to existing field
+ self.field.inner_fields.append(row)
+ else:
+ path = path if path is not None else row.path
+ top_level_field = path.split(".")[0]
+ if top_level_field in self.fields:
+ self.fields[top_level_field].add_field(
+ row, ".".join(path.split(".")[1:])
+ )
+ else:
+ self.fields[top_level_field] = FieldTree(field=row)
+ # logger.warn(f"{self}")
+ return self
+
+ def sort(self):
+ # Required fields before optionals
+ required_fields = {
+ k: v for k, v in self.fields.items() if v.field and v.field.required
+ }
+ optional_fields = {
+ k: v for k, v in self.fields.items() if v.field and not v.field.required
+ }
+
+ self.sorted_fields = []
+ for field_map in [required_fields, optional_fields]:
+ # Top-level fields before fields with nesting
+ self.sorted_fields.extend(
+ sorted(
+ [f for f, val in field_map.items() if val.fields == {}],
+ key=custom_comparator,
+ )
+ )
+ self.sorted_fields.extend(
+ sorted(
+ [f for f, val in field_map.items() if val.fields != {}],
+ key=custom_comparator,
+ )
+ )
+
+ for field_tree in self.fields.values():
+ field_tree.sort()
+
+ def get_fields(self) -> Iterable[FieldRow]:
+ if self.field:
+ yield self.field
+ for key in self.sorted_fields:
+ yield from self.fields[key].get_fields()
+
+ def __repr__(self) -> str:
+ result = {}
+ if self.field:
+ result["_self"] = json.loads(json.dumps(self.field.dict()))
+ for f in self.fields:
+ result[f] = json.loads(str(self.fields[f]))
+ return json.dumps(result, indent=2)
+
+
+def priority_value(path: str) -> str:
+ # A map of low value tokens to their relative importance
+ low_value_token_map = {
+ "env": "X",
+ "classification": "Y",
+ "profiling": "Y",
+ "stateful_ingestion": "Z",
+ }
+ tokens = path.split(".")
+ for low_value_token in low_value_token_map:
+ if low_value_token in tokens:
+ return low_value_token_map[low_value_token]
+
+ # everything else high-prio
+ return "A"
+
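+# Example: custom_comparator("stateful_ingestion.enabled") -> "Zstateful_ingestion.enabled"
+# while custom_comparator("table_pattern.allow") -> "Atable_pattern.allow", so ordinary
+# config fields sort ahead of the env/classification/profiling/stateful_ingestion blocks.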
+
+def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
+ # we don't want default field values to be injected into the description of the field
+ JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
+ schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
+ result: List[str] = [FieldHeader().to_md_line()]
+
+ field_tree = FieldTree(field=None)
+ for field in schema_fields:
+ row: FieldRow = FieldRow.from_schema_field(field)
+ field_tree.add_field(row)
+
+ field_tree.sort()
+
+ for row in field_tree.get_fields():
+ result.append(row.to_md_line())
+
+ # Wrap with a .config-table div.
+ result = ["\n<div className='config-table'>\n\n", *result, "\n</div>\n"]
+
+ return "".join(result)
+
+
+def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str:
+ return gen_md_table_from_json_schema(model.schema())
+
+
+if __name__ == "__main__":
+ # Simple test code.
+ from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
+
+ print("".join(gen_md_table_from_pydantic(SnowflakeV2Config)))
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 5a48f8b7918dce..d5dbb98d3cb17b 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -461,7 +461,7 @@
"mssql-odbc": sql_common | mssql_common | {"pyodbc"},
"mysql": mysql,
# mariadb should have same dependency as mysql
- "mariadb": sql_common | {"pymysql>=1.0.2"},
+ "mariadb": sql_common | mysql,
"okta": {"okta~=1.7.0", "nest-asyncio"},
"oracle": sql_common | {"oracledb"},
"postgres": sql_common | postgres_common,
diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py
index 619f69b016262d..179dbdb231c912 100644
--- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py
+++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py
@@ -1,7 +1,7 @@
import logging
from enum import Enum
from pathlib import Path
-from typing import List, Optional
+from typing import Iterable, List, Optional
import yaml
from pydantic import validator
@@ -226,3 +226,14 @@ def to_yaml(
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.default_flow_style = False
yaml.dump(self.dict(), fp)
+
+ @staticmethod
+ def list_urns(graph: DataHubGraph) -> Iterable[str]:
+ return graph.get_urns_by_filter(
+ entity_types=["structuredProperty"],
+ )
+
+ @staticmethod
+ def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]:
+ for urn in StructuredProperties.list_urns(graph):
+ yield StructuredProperties.from_datahub(graph, urn)
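+
+ # Example usage (given a connected DataHubGraph):
+ #   for prop in StructuredProperties.list(graph):
+ #       print(prop.urn)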
diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py
index f80181192ba583..f6b5ba6176c59d 100644
--- a/metadata-ingestion/src/datahub/cli/cli_utils.py
+++ b/metadata-ingestion/src/datahub/cli/cli_utils.py
@@ -3,7 +3,7 @@
import time
import typing
from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
import click
import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
return next((el for el in ls if el is not None and el.strip() != ""), None)
+_T = TypeVar("_T")
+
+
+def get_or_else(value: Optional[_T], default: _T) -> _T:
+ # Normally we'd use `value or default`. However, that runs into issues
+ # when value is falsey but not None.
+ return value if value is not None else default
+
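+# e.g. get_or_else([], ["GET"]) returns [], whereas `[] or ["GET"]` would fall
+# back to ["GET"] -- this matters for explicitly-empty retry lists passed to the
+# REST emitter.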
+
def parse_run_restli_response(response: requests.Response) -> dict:
response_json = response.json()
if response.status_code != 200:
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
if "acryl.io" not in url:
return url
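+ # Acryl-hosted instances serve GMS on the standard HTTPS port, so an explicit
+ # :8080 (the default self-hosted GMS port) is dropped here, e.g.
+ # "https://customer.acryl.io:8080" -> "https://customer.acryl.io".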
+ if url.endswith(":8080"):
+ url = url.replace(":8080", "")
if url.startswith("http://"):
url = url.replace("http://", "https://")
if url.endswith("acryl.io"):
diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py
index 1a75459a92c5cf..8501cf71f0d544 100644
--- a/metadata-ingestion/src/datahub/cli/delete_cli.py
+++ b/metadata-ingestion/src/datahub/cli/delete_cli.py
@@ -1,8 +1,8 @@
import logging
+import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime
-from random import choices
from typing import Dict, List, Optional
import click
@@ -457,11 +457,11 @@ def by_filter(
click.echo("Found urns of multiple entity types")
for entity_type, entity_urns in urns_by_type.items():
click.echo(
- f"- {len(entity_urns)} {entity_type} urn(s). Sample: {choices(entity_urns, k=min(5, len(entity_urns)))}"
+ f"- {len(entity_urns)} {entity_type} urn(s). Sample: {random.sample(entity_urns, k=min(5, len(entity_urns)))}"
)
else:
click.echo(
- f"Found {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}"
+ f"Found {len(urns)} {entity_type} urn(s). Sample: {random.sample(urns, k=min(5, len(urns)))}"
)
if not force and not dry_run:
diff --git a/metadata-ingestion/src/datahub/cli/migrate.py b/metadata-ingestion/src/datahub/cli/migrate.py
index ea5375c9471283..1bf1211674f596 100644
--- a/metadata-ingestion/src/datahub/cli/migrate.py
+++ b/metadata-ingestion/src/datahub/cli/migrate.py
@@ -179,7 +179,7 @@ def dataplatform2instance_func(
if not force and not dry_run:
# get a confirmation from the operator before proceeding if this is not a dry run
- sampled_urns_to_migrate = random.choices(
+ sampled_urns_to_migrate = random.sample(
urns_to_migrate, k=min(10, len(urns_to_migrate))
)
sampled_new_urns: List[str] = [
@@ -193,7 +193,7 @@ def dataplatform2instance_func(
if key
]
click.echo(
- f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
+ f"Will migrate {len(urns_to_migrate)} urns such as {random.sample(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
)
click.echo(f"New urns will look like {sampled_new_urns}")
click.confirm("Ok to proceed?", abort=True)
diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py
index 42285cf13a5ddc..5cd28516a076d9 100644
--- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py
+++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py
@@ -1,9 +1,11 @@
import json
import logging
from pathlib import Path
+from typing import Iterable
import click
from click_default_group import DefaultGroup
+from ruamel.yaml import YAML
from datahub.api.entities.structuredproperties.structuredproperties import (
StructuredProperties,
@@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None:
)
else:
click.secho(f"Structured property {urn} does not exist")
+
+
+@properties.command(
+ name="list",
+)
+@click.option("--details/--no-details", is_flag=True, default=True)
+@click.option("--to-file", required=False, type=str)
+@telemetry.with_telemetry()
+def list(details: bool, to_file: str) -> None:
+ """List structured properties in DataHub"""
+
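+ # Merge semantics for --to-file: if the file already exists, entries whose urn
+ # matches a freshly listed property are replaced in place, new urns are
+ # appended, and entries only present in the file are left untouched.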
+ def to_yaml_list(
+ objects: Iterable[StructuredProperties], # iterable of objects to dump
+ file: Path,
+ ) -> None:
+ # if file exists, first we read it
+ yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip)
+ yaml.indent(mapping=2, sequence=4, offset=2)
+ yaml.default_flow_style = False
+ serialized_objects = []
+ if file.exists():
+ with open(file, "r") as fp:
+ existing_objects = yaml.load(fp) # this is a list of dicts
+ existing_objects = [
+ StructuredProperties.parse_obj(obj) for obj in existing_objects
+ ]
+ objects = [obj for obj in objects]
+ # do a positional update of the existing objects
+ existing_urns = {obj.urn for obj in existing_objects}
+ # existing_urns = {obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" for obj in existing_objects}
+ for i, obj in enumerate(existing_objects):
+ # existing_urn = obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}"
+ existing_urn = obj.urn
+ if existing_urn in {obj.urn for obj in objects}:
+ existing_objects[i] = next(
+ obj.dict(exclude_unset=True, exclude_none=True)
+ for obj in objects
+ if obj.urn == existing_urn
+ )
+ new_objects = [
+ obj.dict(exclude_unset=True, exclude_none=True)
+ for obj in objects
+ if obj.urn not in existing_urns
+ ]
+ serialized_objects = existing_objects + new_objects
+ else:
+ serialized_objects = [
+ obj.dict(exclude_unset=True, exclude_none=True) for obj in objects
+ ]
+
+ with open(file, "w") as fp:
+ yaml.dump(serialized_objects, fp)
+
+ with get_default_graph() as graph:
+ if details:
+ logger.info(
+ "Listing structured properties with details. Use --no-details for urns only"
+ )
+ structuredproperties = StructuredProperties.list(graph)
+ if to_file:
+ to_yaml_list(structuredproperties, Path(to_file))
+ else:
+ for structuredproperty in structuredproperties:
+ click.secho(
+ f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}"
+ )
+ else:
+ logger.info(
+ "Listing structured property urns only, use --details for more information"
+ )
+ structured_property_urns = StructuredProperties.list_urns(graph)
+ if to_file:
+ with open(to_file, "w") as f:
+ for urn in structured_property_urns:
+ f.write(f"{urn}\n")
+ click.secho(
+ f"Structured property urns written to {to_file}", fg="green"
+ )
+ else:
+ for urn in structured_property_urns:
+ click.secho(f"{urn}")
diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py
index 04242c8bf45d2b..74b8ade7da445b 100644
--- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py
+++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -1,9 +1,21 @@
+from __future__ import annotations
+
import functools
import json
import logging
import os
from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ List,
+ Optional,
+ Sequence,
+ Tuple,
+ Union,
+)
import requests
from deprecated import deprecated
@@ -12,8 +24,13 @@
from datahub import nice_version_name
from datahub.cli import config_utils
-from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
-from datahub.configuration.common import ConfigurationError, OperationalError
+from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
+from datahub.cli.env_utils import get_boolean_env_variable
+from datahub.configuration.common import (
+ ConfigModel,
+ ConfigurationError,
+ OperationalError,
+)
from datahub.emitter.generic_emitter import Emitter
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.request_helper import make_curl_command
@@ -30,10 +47,8 @@
logger = logging.getLogger(__name__)
-_DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect
-_DEFAULT_READ_TIMEOUT_SEC = (
- 30 # Any ingest call taking longer than 30 seconds should be abandoned
-)
+_DEFAULT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect
+_TIMEOUT_LOWER_BOUND_SEC = 1 # if below this, we log a warning
_DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
429,
500,
@@ -46,6 +61,8 @@
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
)
+_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
# The limit is 16mb. We will use a max of 15mb to have some space
# for overhead like request headers.
# This applies to pretty much all calls to GMS.
@@ -60,15 +77,76 @@
)
+class RequestsSessionConfig(ConfigModel):
+ timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
+
+ retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
+ retry_methods: List[str] = _DEFAULT_RETRY_METHODS
+ retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
+
+ extra_headers: Dict[str, str] = {}
+
+ ca_certificate_path: Optional[str] = None
+ client_certificate_path: Optional[str] = None
+ disable_ssl_verification: bool = False
+
+ def build_session(self) -> requests.Session:
+ session = requests.Session()
+
+ if self.extra_headers:
+ session.headers.update(self.extra_headers)
+
+ if self.client_certificate_path:
+ session.cert = self.client_certificate_path
+
+ if self.ca_certificate_path:
+ session.verify = self.ca_certificate_path
+
+ if self.disable_ssl_verification:
+ session.verify = False
+
+ try:
+ # Set raise_on_status to False to propagate errors:
+ # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
+ # Must call `raise_for_status` after making a request, which we do
+ retry_strategy = Retry(
+ total=self.retry_max_times,
+ status_forcelist=self.retry_status_codes,
+ backoff_factor=2,
+ allowed_methods=self.retry_methods,
+ raise_on_status=False,
+ )
+ except TypeError:
+ # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
+ retry_strategy = Retry(
+ total=self.retry_max_times,
+ status_forcelist=self.retry_status_codes,
+ backoff_factor=2,
+ method_whitelist=self.retry_methods,
+ raise_on_status=False,
+ )
+
+ adapter = HTTPAdapter(
+ pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
+ )
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+
+ if self.timeout is not None:
+ # Shim session.request to apply default timeout values.
+ # Via https://stackoverflow.com/a/59317604.
+ session.request = functools.partial( # type: ignore
+ session.request,
+ timeout=self.timeout,
+ )
+
+ return session
+
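+
+# Illustrative usage (DataHubRestEmitter below builds its session this way internally):
+#   session_config = RequestsSessionConfig(timeout=(5, 30), retry_max_times=2)
+#   session = session_config.build_session()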
+
class DataHubRestEmitter(Closeable, Emitter):
_gms_server: str
_token: Optional[str]
_session: requests.Session
- _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
- _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
- _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
- _retry_methods: List[str] = _DEFAULT_RETRY_METHODS
- _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
def __init__(
self,
@@ -99,15 +177,13 @@ def __init__(
self._session = requests.Session()
- self._session.headers.update(
- {
- "X-RestLi-Protocol-Version": "2.0.0",
- "X-DataHub-Py-Cli-Version": nice_version_name(),
- "Content-Type": "application/json",
- }
- )
+ headers = {
+ "X-RestLi-Protocol-Version": "2.0.0",
+ "X-DataHub-Py-Cli-Version": nice_version_name(),
+ "Content-Type": "application/json",
+ }
if token:
- self._session.headers.update({"Authorization": f"Bearer {token}"})
+ headers["Authorization"] = f"Bearer {token}"
else:
# HACK: When no token is provided but system auth env variables are set, we use them.
# Ideally this should simply get passed in as config, instead of being sneakily injected
@@ -116,75 +192,43 @@ def __init__(
# rest emitter, and the rest sink uses the rest emitter under the hood.
system_auth = config_utils.get_system_auth()
if system_auth is not None:
- self._session.headers.update({"Authorization": system_auth})
-
- if extra_headers:
- self._session.headers.update(extra_headers)
-
- if client_certificate_path:
- self._session.cert = client_certificate_path
+ headers["Authorization"] = system_auth
- if ca_certificate_path:
- self._session.verify = ca_certificate_path
-
- if disable_ssl_verification:
- self._session.verify = False
-
- self._connect_timeout_sec = (
- connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
- )
- self._read_timeout_sec = (
- read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
- )
-
- if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
- logger.warning(
- f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
- )
-
- if retry_status_codes is not None: # Only if missing. Empty list is allowed
- self._retry_status_codes = retry_status_codes
-
- if retry_methods is not None:
- self._retry_methods = retry_methods
-
- if retry_max_times:
- self._retry_max_times = retry_max_times
-
- try:
- # Set raise_on_status to False to propagate errors:
- # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
- # Must call `raise_for_status` after making a request, which we do
- retry_strategy = Retry(
- total=self._retry_max_times,
- status_forcelist=self._retry_status_codes,
- backoff_factor=2,
- allowed_methods=self._retry_methods,
- raise_on_status=False,
- )
- except TypeError:
- # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
- retry_strategy = Retry(
- total=self._retry_max_times,
- status_forcelist=self._retry_status_codes,
- backoff_factor=2,
- method_whitelist=self._retry_methods,
- raise_on_status=False,
+ timeout: float | tuple[float, float]
+ if connect_timeout_sec is not None or read_timeout_sec is not None:
+ timeout = (
+ connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
+ read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
)
+ if (
+ timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
+ or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
+ ):
+ logger.warning(
+ f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
+ )
+ else:
+ timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
+ if timeout < _TIMEOUT_LOWER_BOUND_SEC:
+ logger.warning(
+ f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
+ )
- adapter = HTTPAdapter(
- pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
- )
- self._session.mount("http://", adapter)
- self._session.mount("https://", adapter)
-
- # Shim session.request to apply default timeout values.
- # Via https://stackoverflow.com/a/59317604.
- self._session.request = functools.partial( # type: ignore
- self._session.request,
- timeout=(self._connect_timeout_sec, self._read_timeout_sec),
+ self._session_config = RequestsSessionConfig(
+ timeout=timeout,
+ retry_status_codes=get_or_else(
+ retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
+ ),
+ retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
+ retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
+ extra_headers={**headers, **(extra_headers or {})},
+ ca_certificate_path=ca_certificate_path,
+ client_certificate_path=client_certificate_path,
+ disable_ssl_verification=disable_ssl_verification,
)
+ self._session = self._session_config.build_session()
+
def test_connection(self) -> None:
url = f"{self._gms_server}/config"
response = self._session.get(url)
@@ -291,7 +335,8 @@ def emit_mcps(
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
async_flag: Optional[bool] = None,
) -> int:
- logger.debug("Attempting to emit batch mcps")
+ if _DATAHUB_EMITTER_TRACE:
+ logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
for mcp in mcps:
ensure_has_system_metadata(mcp)
@@ -304,22 +349,25 @@ def emit_mcps(
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
for mcp_obj in mcp_objs:
mcp_obj_size = len(json.dumps(mcp_obj))
- logger.debug(
- f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
- )
+ if _DATAHUB_EMITTER_TRACE:
+ logger.debug(
+ f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+ )
if (
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
):
- logger.debug("Decided to create new chunk")
+ if _DATAHUB_EMITTER_TRACE:
+ logger.debug("Decided to create new chunk")
mcp_obj_chunks.append([])
current_chunk_size = 0
mcp_obj_chunks[-1].append(mcp_obj)
current_chunk_size += mcp_obj_size
- logger.debug(
- f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
- )
+ if len(mcp_obj_chunks) > 0:
+ logger.debug(
+ f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+ )
for mcp_obj_chunk in mcp_obj_chunks:
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
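The loop above splits the serialized MCPs so that no chunk exceeds the payload byte budget or the per-chunk item limit. A simplified, standalone version of that chunking rule (the limits here are illustrative stand-ins, not DataHub's actual constants):

```python
import json
from typing import Any, Dict, List

MAX_PAYLOAD_BYTES = 15_000_000  # illustrative stand-in for INGEST_MAX_PAYLOAD_BYTES
MAX_CHUNK_LENGTH = 200          # illustrative stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH


def chunk_payloads(objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_size = 0
    for obj in objs:
        size = len(json.dumps(obj))
        # Start a new chunk if this object would blow the byte budget or the
        # per-chunk item limit.
        if current_size + size > MAX_PAYLOAD_BYTES or len(chunks[-1]) >= MAX_CHUNK_LENGTH:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += size
    return chunks
```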
diff --git a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
index 559f0b77f59dfa..b63c96b617ff06 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
@@ -1,10 +1,9 @@
import json
import logging
-from typing import Iterable, List
+from typing import TYPE_CHECKING, Iterable, List
from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
from datahub.emitter.serialization_helper import pre_json_transform
-from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import (
DatasetProfileClass,
@@ -12,12 +11,15 @@
SchemaMetadataClass,
)
+if TYPE_CHECKING:
+ from datahub.ingestion.api.source import SourceReport
+
logger = logging.getLogger(__name__)
class EnsureAspectSizeProcessor:
def __init__(
- self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+ self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
):
self.report = report
self.payload_constraint = payload_constraint
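Moving the `SourceReport` import under `TYPE_CHECKING` and quoting the annotation keeps the name visible to type checkers without importing the module at runtime, which is a common way to break an import cycle. A minimal sketch of the pattern with a hypothetical module name:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only imported by type checkers; never executed at runtime.
    from my_package.reporting import Report  # hypothetical module


class Processor:
    def __init__(self, report: "Report") -> None:
        # The quoted annotation is resolved lazily, so my_package.reporting is
        # never imported while this module loads.
        self.report = report
```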
diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py
index c3638635b19aac..53cb1b0ecad4ee 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -31,6 +31,9 @@
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
auto_patch_last_modified,
)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+ EnsureAspectSizeProcessor,
+)
from datahub.ingestion.api.closeable import Closeable
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
from datahub.ingestion.api.report import Report
@@ -331,6 +334,8 @@ def as_obj(self) -> dict:
}
def compute_stats(self) -> None:
+ super().compute_stats()
+
duration = datetime.datetime.now() - self.start_time
workunits_produced = self.events_produced
if duration.total_seconds() > 0:
@@ -450,6 +455,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
browse_path_processor,
partial(auto_workunit_reporter, self.get_report()),
auto_patch_last_modified,
+ EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
]
@staticmethod
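The new entry registers `ensure_aspect_size` alongside the other work-unit processors. Conceptually, each processor is a callable from a stream of work units to a stream of work units, and the pipeline threads the stream through the non-`None` entries in order; the following is a simplified model of that chaining, not the pipeline's actual implementation:

```python
from typing import Callable, Iterable, List, Optional, TypeVar

WU = TypeVar("WU")
Processor = Callable[[Iterable[WU]], Iterable[WU]]


def apply_processors(
    stream: Iterable[WU], processors: List[Optional[Processor]]
) -> Iterable[WU]:
    # Each processor wraps the stream produced by the previous one; None
    # entries (disabled processors) are simply skipped.
    for processor in processors:
        if processor is not None:
            stream = processor(stream)
    return stream
```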
diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py
index ddcb74e354613a..bdcdcb8990eba7 100644
--- a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py
+++ b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py
@@ -1,4 +1,3 @@
-import os
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
)
max_workers: int = Field(
- default=(os.cpu_count() or 4),
- description="Number of worker processes to use for classification. Set to 1 to disable.",
+ default=1,
+ description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
)
table_pattern: AllowDenyPattern = Field(
diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py
index ca9a41172e5b6e..8c5f894a072d93 100644
--- a/metadata-ingestion/src/datahub/ingestion/graph/client.py
+++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -179,21 +179,24 @@ def frontend_base_url(self) -> str:
@classmethod
def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
+ session_config = emitter._session_config
+ if isinstance(session_config.timeout, tuple):
+ # TODO: This is slightly lossy. Eventually, we want to modify the emitter
+ # to accept a tuple for timeout_sec, and then we'll be able to remove this.
+ timeout_sec: Optional[float] = session_config.timeout[0]
+ else:
+ timeout_sec = session_config.timeout
return cls(
DatahubClientConfig(
server=emitter._gms_server,
token=emitter._token,
- timeout_sec=emitter._read_timeout_sec,
- retry_status_codes=emitter._retry_status_codes,
- retry_max_times=emitter._retry_max_times,
- extra_headers=emitter._session.headers,
- disable_ssl_verification=emitter._session.verify is False,
- ca_certificate_path=(
- emitter._session.verify
- if isinstance(emitter._session.verify, str)
- else None
- ),
- client_certificate_path=emitter._session.cert,
+ timeout_sec=timeout_sec,
+ retry_status_codes=session_config.retry_status_codes,
+ retry_max_times=session_config.retry_max_times,
+ extra_headers=session_config.extra_headers,
+ disable_ssl_verification=session_config.disable_ssl_verification,
+ ca_certificate_path=session_config.ca_certificate_path,
+ client_certificate_path=session_config.client_certificate_path,
)
)
@@ -245,9 +248,11 @@ def make_rest_sink(
with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
yield sink
if sink.report.failures:
+ logger.error(
+ f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}"
+ )
raise OperationalError(
- f"Failed to emit {len(sink.report.failures)} records",
- info=sink.report.as_obj(),
+ f"Failed to emit {len(sink.report.failures)} records"
)
def emit_all(
diff --git a/metadata-ingestion/src/datahub/ingestion/graph/config.py b/metadata-ingestion/src/datahub/ingestion/graph/config.py
index 5f269e14e1a4af..8f0a5844c97c4b 100644
--- a/metadata-ingestion/src/datahub/ingestion/graph/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/graph/config.py
@@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel):
# by callers / the CLI, but the actual client should not have any magic.
server: str
token: Optional[str] = None
- timeout_sec: Optional[int] = None
+ timeout_sec: Optional[float] = None
retry_status_codes: Optional[List[int]] = None
retry_max_times: Optional[int] = None
extra_headers: Optional[Dict[str, str]] = None
diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
index 7a5ed154d40bc7..30e81643837375 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -248,6 +248,9 @@ def report_table_dropped(self, table: str) -> None:
"Enabled by default when stateful ingestion is turned on.",
)
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(
+ SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+)
class GlueSource(StatefulIngestionSourceBase):
"""
Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.
@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
"Action": [
"glue:GetDataflowGraph",
"glue:GetJobs",
+ "s3:GetObject",
],
"Resource": "*"
}
```
- plus `s3:GetObject` for the job script locations.
+ For profiling datasets, the following additional permissions are required:
+ ```json
+ {
+ "Effect": "Allow",
+ "Action": [
+ "glue:GetPartitions",
+ ],
+ "Resource": "*"
+ }
+ ```
"""
@@ -1054,49 +1067,66 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
yield from self.gen_database_containers(database)
for table in tables:
- database_name = table["DatabaseName"]
table_name = table["Name"]
- full_table_name = f"{database_name}.{table_name}"
- self.report.report_table_scanned()
- if not self.source_config.database_pattern.allowed(
- database_name
- ) or not self.source_config.table_pattern.allowed(full_table_name):
- self.report.report_table_dropped(full_table_name)
- continue
+ try:
+ yield from self._gen_table_wu(table=table)
+ except KeyError as e:
+ self.report.report_failure(
+ message="Failed to extract workunit for table",
+ context=f"Table: {table_name}",
+ exc=e,
+ )
+ if self.extract_transforms:
+ yield from self._transform_extraction()
- dataset_urn = make_dataset_urn_with_platform_instance(
- platform=self.platform,
- name=full_table_name,
- env=self.env,
- platform_instance=self.source_config.platform_instance,
- )
+ def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+ database_name = table["DatabaseName"]
+ table_name = table["Name"]
+ full_table_name = f"{database_name}.{table_name}"
+ self.report.report_table_scanned()
+ if not self.source_config.database_pattern.allowed(
+ database_name
+ ) or not self.source_config.table_pattern.allowed(full_table_name):
+ self.report.report_table_dropped(full_table_name)
+ return
+
+ dataset_urn = make_dataset_urn_with_platform_instance(
+ platform=self.platform,
+ name=full_table_name,
+ env=self.env,
+ platform_instance=self.source_config.platform_instance,
+ )
- mce = self._extract_record(dataset_urn, table, full_table_name)
- yield MetadataWorkUnit(full_table_name, mce=mce)
+ mce = self._extract_record(dataset_urn, table, full_table_name)
+ yield MetadataWorkUnit(full_table_name, mce=mce)
- # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
- # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
- yield MetadataChangeProposalWrapper(
- entityUrn=dataset_urn,
- aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
- ).as_workunit()
+ # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+ # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+ yield MetadataChangeProposalWrapper(
+ entityUrn=dataset_urn,
+ aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+ ).as_workunit()
- yield from self._get_domain_wu(
- dataset_name=full_table_name,
- entity_urn=dataset_urn,
- )
- yield from self.add_table_to_database_container(
- dataset_urn=dataset_urn, db_name=database_name
- )
+ yield from self._get_domain_wu(
+ dataset_name=full_table_name,
+ entity_urn=dataset_urn,
+ )
+ yield from self.add_table_to_database_container(
+ dataset_urn=dataset_urn, db_name=database_name
+ )
- wu = self.get_lineage_if_enabled(mce)
- if wu:
- yield wu
+ wu = self.get_lineage_if_enabled(mce)
+ if wu:
+ yield wu
+ try:
yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
- if self.extract_transforms:
- yield from self._transform_extraction()
+ except KeyError as e:
+ self.report.report_failure(
+ message="Failed to extract profile for table",
+ context=f"Table: {dataset_urn}",
+ exc=e,
+ )
def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
dags: Dict[str, Optional[Dict[str, Any]]] = {}
diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
index 878b8dd1bb9a51..360f18aa448f27 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
@@ -1,6 +1,11 @@
import logging
import os
-from typing import Optional
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+if TYPE_CHECKING:
+ from mypy_boto3_s3.service_resource import ObjectSummary
+
S3_PREFIXES = ["s3://", "s3n://", "s3a://"]
@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
)
return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+def group_s3_objects_by_dirname(
+ s3_objects: Iterable["ObjectSummary"],
+) -> Dict[str, List["ObjectSummary"]]:
+ """
+ Groups S3 objects by their directory name.
+
+ If an s3_object is in the root directory (e.g., s3://bucket/file.txt), it is grouped under '/'.
+ """
+ grouped_s3_objs = defaultdict(list)
+ for obj in s3_objects:
+ if "/" in obj.key:
+ dirname = obj.key.rsplit("/", 1)[0]
+ else:
+ dirname = "/"
+ grouped_s3_objs[dirname].append(obj)
+ return grouped_s3_objs
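A small usage sketch for `group_s3_objects_by_dirname`, using a stand-in object with a `key` attribute in place of a real boto3 `ObjectSummary`:

```python
from dataclasses import dataclass


@dataclass
class FakeObjectSummary:
    key: str


objs = [
    FakeObjectSummary("logs/2024/01/events.json"),
    FakeObjectSummary("logs/2024/01/errors.json"),
    FakeObjectSummary("readme.txt"),
]

grouped = group_s3_objects_by_dirname(objs)
# {"logs/2024/01": [<events.json>, <errors.json>], "/": [<readme.txt>]}
```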
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 38eab3606b7e95..508b4bbaa277dc 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -253,14 +253,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
for project in projects:
yield from self.bq_schema_extractor.get_project_workunits(project)
- self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
- yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
- [p.id for p in projects],
- self.bq_schema_extractor.view_refs_by_project,
- self.bq_schema_extractor.view_definitions,
- self.bq_schema_extractor.snapshot_refs_by_project,
- self.bq_schema_extractor.snapshots_by_ref,
- )
+ with self.report.new_stage("*: View and Snapshot Lineage"):
+ yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+ [p.id for p in projects],
+ self.bq_schema_extractor.view_refs_by_project,
+ self.bq_schema_extractor.view_definitions,
+ self.bq_schema_extractor.snapshot_refs_by_project,
+ self.bq_schema_extractor.snapshots_by_ref,
+ )
if self.config.use_queries_v2:
# if both usage and lineage are disabled then skip queries extractor piece
@@ -270,29 +270,29 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
):
return
- self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
- with BigQueryQueriesExtractor(
- connection=self.config.get_bigquery_client(),
- schema_api=self.bq_schema_extractor.schema_api,
- config=BigQueryQueriesExtractorConfig(
- window=self.config,
- user_email_pattern=self.config.usage.user_email_pattern,
- include_lineage=self.config.include_table_lineage,
- include_usage_statistics=self.config.include_usage_statistics,
- include_operations=self.config.usage.include_operational_stats,
- top_n_queries=self.config.usage.top_n_queries,
- region_qualifiers=self.config.region_qualifiers,
- ),
- structured_report=self.report,
- filters=self.filters,
- identifiers=self.identifiers,
- schema_resolver=self.sql_parser_schema_resolver,
- discovered_tables=self.bq_schema_extractor.table_refs,
- ) as queries_extractor:
- self.report.queries_extractor = queries_extractor.report
- yield from queries_extractor.get_workunits_internal()
-
+ with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+ with BigQueryQueriesExtractor(
+ connection=self.config.get_bigquery_client(),
+ schema_api=self.bq_schema_extractor.schema_api,
+ config=BigQueryQueriesExtractorConfig(
+ window=self.config,
+ user_email_pattern=self.config.usage.user_email_pattern,
+ include_lineage=self.config.include_table_lineage,
+ include_usage_statistics=self.config.include_usage_statistics,
+ include_operations=self.config.usage.include_operational_stats,
+ include_queries=self.config.include_queries,
+ include_query_usage_statistics=self.config.include_query_usage_statistics,
+ top_n_queries=self.config.usage.top_n_queries,
+ region_qualifiers=self.config.region_qualifiers,
+ ),
+ structured_report=self.report,
+ filters=self.filters,
+ identifiers=self.identifiers,
+ schema_resolver=self.sql_parser_schema_resolver,
+ discovered_tables=self.bq_schema_extractor.table_refs,
+ ) as queries_extractor:
+ self.report.queries_extractor = queries_extractor.report
+ yield from queries_extractor.get_workunits_internal()
else:
if self.config.include_usage_statistics:
yield from self.usage_extractor.get_usage_workunits(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index ef323260b014e6..afbe919df4dcae 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -447,6 +447,14 @@ class BigQueryV2Config(
default=False,
description="If enabled, uses the new queries extractor to extract queries from bigquery.",
)
+ include_queries: bool = Field(
+ default=True,
+ description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+ )
+ include_query_usage_statistics: bool = Field(
+ default=True,
+ description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+ )
@property
def have_table_data_read_permission(self) -> bool:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
index 06842da67f76ca..8e55d81aac5fe3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -190,6 +190,3 @@ class BigQueryV2Report(
num_skipped_external_table_lineage: int = 0
queries_extractor: Optional[BigQueryQueriesExtractorReport] = None
-
- def set_ingestion_stage(self, project_id: str, stage: str) -> None:
- self.report_ingestion_stage_start(f"{project_id}: {stage}")
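The removal above goes hand in hand with the switch from `set_ingestion_stage(...)` calls to `with report.new_stage(...)` blocks throughout this change. A minimal sketch of a context-manager style stage reporter that captures the idea (the real report classes have their own implementation):

```python
import contextlib
import time
from typing import Dict, Iterator


class StageReport:
    def __init__(self) -> None:
        self.stage_durations: Dict[str, float] = {}

    @contextlib.contextmanager
    def new_stage(self, name: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # The duration is recorded even if the stage raises, which is the
            # main advantage over a start-only marker like set_ingestion_stage.
            self.stage_durations[name] = round(time.perf_counter() - start, 2)


report = StageReport()
with report.new_stage("project-1: Metadata Extraction"):
    pass  # do the stage's work here
```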
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
index bc2688e6b481ab..56e930dfb811f1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -248,9 +248,9 @@ def modified_base32decode(self, text_to_decode: str) -> str:
def get_project_workunits(
self, project: BigqueryProject
) -> Iterable[MetadataWorkUnit]:
- self.report.set_ingestion_stage(project.id, METADATA_EXTRACTION)
- logger.info(f"Processing project: {project.id}")
- yield from self._process_project(project)
+ with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+ logger.info(f"Processing project: {project.id}")
+ yield from self._process_project(project)
def get_dataplatform_instance_aspect(
self, dataset_urn: str, project_id: str
@@ -405,11 +405,11 @@ def _process_project(
if self.config.is_profiling_enabled():
logger.info(f"Starting profiling project {project_id}")
- self.report.set_ingestion_stage(project_id, PROFILING)
- yield from self.profiler.get_workunits(
- project_id=project_id,
- tables=db_tables,
- )
+ with self.report.new_stage(f"{project_id}: {PROFILING}"):
+ yield from self.profiler.get_workunits(
+ project_id=project_id,
+ tables=db_tables,
+ )
def _process_project_datasets(
self,
@@ -1203,9 +1203,9 @@ def get_tables_for_dataset(
report=self.report,
)
- self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = round(
- timer.elapsed_seconds(), 2
- )
+ self.report.metadata_extraction_sec[
+ f"{project_id}.{dataset.name}"
+ ] = timer.elapsed_seconds(digits=2)
def get_core_table_details(
self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
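Several hunks in this change replace `round(timer.elapsed_seconds(), 2)` with `timer.elapsed_seconds(digits=2)`, pushing the rounding into the timer itself. An illustrative timer with that shape (DataHub's `PerfTimer` has its own implementation; this is only a sketch):

```python
import time
from typing import Optional


class SketchTimer:
    def __enter__(self) -> "SketchTimer":
        self._start = time.perf_counter()
        self._end: Optional[float] = None
        return self

    def __exit__(self, *exc: object) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self, digits: Optional[int] = None) -> float:
        end = self._end if self._end is not None else time.perf_counter()
        elapsed = end - self._start
        return round(elapsed, digits) if digits is not None else elapsed


with SketchTimer() as timer:
    sum(range(100_000))
print(timer.elapsed_seconds(digits=2))
```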
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
index ba3357aa8ca20c..433282a21fdb66 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
@@ -330,11 +330,11 @@ def get_lineage_workunits(
projects = ["*"] # project_id not used when using exported metadata
for project in projects:
- self.report.set_ingestion_stage(project, LINEAGE_EXTRACTION)
- yield from self.generate_lineage(
- project,
- table_refs,
- )
+ with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+ yield from self.generate_lineage(
+ project,
+ table_refs,
+ )
if self.redundant_run_skip_handler:
# Update the checkpoint state for this run.
@@ -368,8 +368,8 @@ def generate_lineage(
self.report.lineage_metadata_entries[project_id] = len(lineage)
logger.info(f"Built lineage map containing {len(lineage)} entries.")
logger.debug(f"lineage metadata is {lineage}")
- self.report.lineage_extraction_sec[project_id] = round(
- timer.elapsed_seconds(), 2
+ self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+ digits=2
)
self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
memory_footprint.total_size(lineage)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
index 876ffab85ba311..f2f6cc731858d1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
@@ -495,62 +495,62 @@ def _ingest_events(
def _generate_operational_workunits(
self, usage_state: BigQueryUsageState, table_refs: Collection[str]
) -> Iterable[MetadataWorkUnit]:
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
- for audit_event in usage_state.standalone_events():
- try:
- operational_wu = self._create_operation_workunit(
- audit_event, table_refs
- )
- if operational_wu:
- yield operational_wu
- self.report.num_operational_stats_workunits_emitted += 1
- except Exception as e:
- self.report.warning(
- message="Unable to generate operation workunit",
- context=f"{audit_event}",
- exc=e,
- )
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+ for audit_event in usage_state.standalone_events():
+ try:
+ operational_wu = self._create_operation_workunit(
+ audit_event, table_refs
+ )
+ if operational_wu:
+ yield operational_wu
+ self.report.num_operational_stats_workunits_emitted += 1
+ except Exception as e:
+ self.report.warning(
+ message="Unable to generate operation workunit",
+ context=f"{audit_event}",
+ exc=e,
+ )
def _generate_usage_workunits(
self, usage_state: BigQueryUsageState
) -> Iterable[MetadataWorkUnit]:
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
- top_n = (
- self.config.usage.top_n_queries
- if self.config.usage.include_top_n_queries
- else 0
- )
- for entry in usage_state.usage_statistics(top_n=top_n):
- try:
- query_freq = [
- (
- self.uuid_to_query.get(
- query_hash, usage_state.queries[query_hash]
- ),
- count,
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+ top_n = (
+ self.config.usage.top_n_queries
+ if self.config.usage.include_top_n_queries
+ else 0
+ )
+ for entry in usage_state.usage_statistics(top_n=top_n):
+ try:
+ query_freq = [
+ (
+ self.uuid_to_query.get(
+ query_hash, usage_state.queries[query_hash]
+ ),
+ count,
+ )
+ for query_hash, count in entry.query_freq
+ ]
+ yield make_usage_workunit(
+ bucket_start_time=datetime.fromisoformat(entry.timestamp),
+ resource=BigQueryTableRef.from_string_name(entry.resource),
+ query_count=entry.query_count,
+ query_freq=query_freq,
+ user_freq=entry.user_freq,
+ column_freq=entry.column_freq,
+ bucket_duration=self.config.bucket_duration,
+ resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+ top_n_queries=self.config.usage.top_n_queries,
+ format_sql_queries=self.config.usage.format_sql_queries,
+ queries_character_limit=self.config.usage.queries_character_limit,
+ )
+ self.report.num_usage_workunits_emitted += 1
+ except Exception as e:
+ self.report.warning(
+ message="Unable to generate usage statistics workunit",
+ context=f"{entry.timestamp}, {entry.resource}",
+ exc=e,
)
- for query_hash, count in entry.query_freq
- ]
- yield make_usage_workunit(
- bucket_start_time=datetime.fromisoformat(entry.timestamp),
- resource=BigQueryTableRef.from_string_name(entry.resource),
- query_count=entry.query_count,
- query_freq=query_freq,
- user_freq=entry.user_freq,
- column_freq=entry.column_freq,
- bucket_duration=self.config.bucket_duration,
- resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
- top_n_queries=self.config.usage.top_n_queries,
- format_sql_queries=self.config.usage.format_sql_queries,
- queries_character_limit=self.config.usage.queries_character_limit,
- )
- self.report.num_usage_workunits_emitted += 1
- except Exception as e:
- self.report.warning(
- message="Unable to generate usage statistics workunit",
- context=f"{entry.timestamp}, {entry.resource}",
- exc=e,
- )
def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
for project_id in projects:
with PerfTimer() as timer:
try:
- self.report.set_ingestion_stage(
- project_id, USAGE_EXTRACTION_INGESTION
- )
- yield from self._get_parsed_bigquery_log_events(project_id)
+ with self.report.new_stage(
+ f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+ ):
+ yield from self._get_parsed_bigquery_log_events(project_id)
except Exception as e:
self.report.usage_failed_extraction.append(project_id)
self.report.warning(
@@ -572,8 +572,8 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
)
self.report_status(f"usage-extraction-{project_id}", False)
- self.report.usage_extraction_sec[project_id] = round(
- timer.elapsed_seconds(), 2
+ self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+ digits=2
)
def _store_usage_event(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py
index d8ab62f1d6d91f..7bf1d66f618a4b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,30 @@ def get_workunits(
) -> Iterable[MetadataWorkUnit]:
for keyspace_name in cassandra_data.keyspaces:
tables = cassandra_data.tables.get(keyspace_name, [])
- self.report.set_ingestion_stage(keyspace_name, PROFILING)
- with ThreadPoolExecutor(
- max_workers=self.config.profiling.max_workers
- ) as executor:
- future_to_dataset = {
- executor.submit(
- self.generate_profile,
- keyspace_name,
- table_name,
- cassandra_data.columns.get(table_name, []),
- ): table_name
- for table_name in tables
- }
- for future in as_completed(future_to_dataset):
- table_name = future_to_dataset[future]
- try:
- yield from future.result()
- except Exception as exc:
- self.report.profiling_skipped_other[table_name] += 1
- self.report.failure(
- message="Failed to profile for table",
- context=f"{keyspace_name}.{table_name}",
- exc=exc,
- )
+ with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+ with ThreadPoolExecutor(
+ max_workers=self.config.profiling.max_workers
+ ) as executor:
+ future_to_dataset = {
+ executor.submit(
+ self.generate_profile,
+ keyspace_name,
+ table_name,
+ cassandra_data.columns.get(table_name, []),
+ ): table_name
+ for table_name in tables
+ }
+ for future in as_completed(future_to_dataset):
+ table_name = future_to_dataset[future]
+ try:
+ yield from future.result()
+ except Exception as exc:
+ self.report.profiling_skipped_other[table_name] += 1
+ self.report.failure(
+ message="Failed to profile for table",
+ context=f"{keyspace_name}.{table_name}",
+ exc=exc,
+ )
def generate_profile(
self,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py
index 41d4ac7ced6035..75a0ba0c617734 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py
@@ -54,9 +54,6 @@ def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
else:
raise KeyError(f"Unknown entity {ent_type}.")
- def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
- self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
# TODO Need to create separate common config for profiling report
profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py
index cd3c2146e6d848..8622e221940317 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
import os
from typing import Optional, Set
+import pydantic
from pydantic import Field, root_validator
from datahub.configuration.common import AllowDenyPattern
@@ -107,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
+ drop_duplicate_schema_fields: bool = Field(
+ default=False,
+ description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+ "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+ )
+
@root_validator(skip_on_failure=True)
def check_ingesting_data(cls, values):
if (
@@ -119,3 +126,12 @@ def check_ingesting_data(cls, values):
" Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
)
return values
+
+ @pydantic.validator("database_connection")
+ def validate_mysql_scheme(
+ cls, v: SQLAlchemyConnectionConfig
+ ) -> SQLAlchemyConnectionConfig:
+ if "mysql" in v.scheme:
+ if v.scheme != "mysql+pymysql":
+ raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+ return v
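The validator above enforces the `mysql+pymysql` driver whenever a MySQL scheme is configured. The same check expressed on a standalone, hypothetical pydantic model:

```python
import pydantic


class ConnectionSketch(pydantic.BaseModel):
    scheme: str = "mysql+pymysql"

    @pydantic.validator("scheme")
    def require_pymysql_for_mysql(cls, v: str) -> str:
        if "mysql" in v and v != "mysql+pymysql":
            raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
        return v


ConnectionSketch(scheme="mysql+pymysql")  # accepted
# ConnectionSketch(scheme="mysql")        # would raise a ValidationError
```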
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
index 80906ca63115f5..ee105f4862caba 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@ def execute_server_cursor(
self, query: str, params: Dict[str, Any]
) -> Iterable[Dict[str, Any]]:
with self.engine.connect() as conn:
- if self.engine.dialect.name == "postgresql":
+ if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
with conn.begin(): # Transaction required for PostgreSQL server-side cursor
+ # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+ # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
conn = conn.execution_options(
stream_results=True,
yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@ def execute_server_cursor(
result = conn.execute(query, params)
for row in result:
yield dict(row)
- elif self.engine.dialect.name == "mysql": # MySQL
- import MySQLdb
-
- with contextlib.closing(
- conn.connection.cursor(MySQLdb.cursors.SSCursor)
- ) as cursor:
- logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
- cursor.execute(query, params)
-
- columns = [desc[0] for desc in cursor.description]
- while True:
- rows = cursor.fetchmany(self.config.database_query_batch_size)
- if not rows:
- break # Use break instead of return in generator
- for row in rows:
- yield dict(zip(columns, row))
else:
raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
index cb72441344088c..472abd0a97ec70 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
@@ -12,7 +12,10 @@
support_status,
)
from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import (
+ auto_fix_duplicate_schema_field_paths,
+ auto_workunit_reporter,
+)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.datahub.config import DataHubSourceConfig
from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ def get_report(self) -> SourceReport:
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
# Exactly replicate data from DataHub source
- return [partial(auto_workunit_reporter, self.get_report())]
+ return [
+ (
+ auto_fix_duplicate_schema_field_paths
+ if self.config.drop_duplicate_schema_fields
+ else None
+ ),
+ partial(auto_workunit_reporter, self.get_report()),
+ ]
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
self.report.stop_time = datetime.now(tz=timezone.utc)
@@ -130,7 +140,7 @@ def _get_database_workunits(
self._commit_progress(i)
def _get_kafka_workunits(
- self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+ self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
) -> Iterable[MetadataWorkUnit]:
if self.config.kafka_connection is None:
return
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py
index c8eb035461ca16..9712d4ddc67998 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py
@@ -45,6 +45,3 @@ def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
self.views_scanned += 1
else:
raise KeyError(f"Unknown entity {ent_type}.")
-
- def set_ingestion_stage(self, dataset: str, stage: str) -> None:
- self.report_ingestion_stage_start(f"{dataset}: {stage}")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py
index 319290d25169af..6d34e86be6282e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py
@@ -472,8 +472,8 @@ def generate_profiles(
env=self.config.env,
platform_instance=self.config.platform_instance,
)
- self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING)
- yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+ with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+ yield from self.profiler.get_workunits(dataset_info, dataset_urn)
def generate_view_lineage(
self, dataset_urn: str, parents: List[str]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
index 168b787b85e8be..b4cc5423277c5a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
@@ -141,40 +141,36 @@ def get_workunits_internal(
) -> Iterable[MetadataWorkUnit]:
if self.config.cleanup_expired_tokens:
try:
- self.report.report_ingestion_stage_start("Expired Token Cleanup")
- self.revoke_expired_tokens()
+ with self.report.new_stage("Expired Token Cleanup"):
+ self.revoke_expired_tokens()
except Exception as e:
self.report.failure("While trying to cleanup expired token ", exc=e)
if self.config.truncate_indices:
try:
- self.report.report_ingestion_stage_start("Truncate Indices")
- self.truncate_indices()
+ with self.report.new_stage("Truncate Indices"):
+ self.truncate_indices()
except Exception as e:
self.report.failure("While trying to truncate indices ", exc=e)
if self.config.soft_deleted_entities_cleanup.enabled:
try:
- self.report.report_ingestion_stage_start(
- "Soft Deleted Entities Cleanup"
- )
- self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+ with self.report.new_stage("Soft Deleted Entities Cleanup"):
+ self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
except Exception as e:
self.report.failure(
"While trying to cleanup soft deleted entities ", exc=e
)
if self.config.dataprocess_cleanup.enabled:
try:
- self.report.report_ingestion_stage_start("Data Process Cleanup")
- yield from self.dataprocess_cleanup.get_workunits_internal()
+ with self.report.new_stage("Data Process Cleanup"):
+ yield from self.dataprocess_cleanup.get_workunits_internal()
except Exception as e:
self.report.failure("While trying to cleanup data process ", exc=e)
if self.config.execution_request_cleanup.enabled:
try:
- self.report.report_ingestion_stage_start("Execution request Cleanup")
- self.execution_request_cleanup.run()
+ with self.report.new_stage("Execution request Cleanup"):
+ self.execution_request_cleanup.run()
except Exception as e:
self.report.failure("While trying to cleanup execution request ", exc=e)
- # Otherwise last stage's duration does not get calculated.
- self.report.report_ingestion_stage_start("End")
yield from []
def truncate_indices(self) -> None:
@@ -188,6 +184,9 @@ def truncate_indices(self) -> None:
self._truncate_timeseries_helper(
aspect_name="dashboardUsageStatistics", entity_type="dashboard"
)
+ self._truncate_timeseries_helper(
+ aspect_name="queryusagestatistics", entity_type="query"
+ )
def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None:
self._truncate_timeseries_with_watch_optional(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py
index f9a00d7f009058..c1763b16f3670f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
)
keep_history_max_days: int = Field(
- 30,
+ 90,
description="Maximum number of days to keep execution requests for, per ingestion source",
)
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
description="Maximum runtime in seconds for the cleanup task",
)
+ limit_entities_delete: Optional[int] = Field(
+ 10000, description="Max number of execution requests to hard delete."
+ )
+
max_read_errors: int = Field(
default=10,
description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
ergc_delete_errors: int = 0
ergc_start_time: Optional[datetime.datetime] = None
ergc_end_time: Optional[datetime.datetime] = None
+ ergc_delete_limit_reached: bool = False
+ ergc_runtime_limit_reached: bool = False
class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ def __init__(
self.graph = graph
self.report = report
self.instance_id = int(time.time())
+ self.last_print_time = 0.0
if config is not None:
self.config = config
else:
self.config = DatahubExecutionRequestCleanupConfig()
+ def _print_report(self) -> None:
+ time_taken = round(time.time() - self.last_print_time, 1)
+ # Print report every 2 minutes
+ if time_taken > 120:
+ self.last_print_time = time.time()
+ logger.info(f"\n{self.report.as_string()}")
+
def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
input_aspect = (
entry.get("aspects", {})
@@ -175,6 +189,7 @@ def _scroll_garbage_records(self):
running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
for entry in self._scroll_execution_requests():
+ self._print_report()
self.report.ergc_records_read += 1
key = entry.ingestion_source
@@ -225,15 +240,12 @@ def _scroll_garbage_records(self):
f"record timestamp: {entry.requested_at}."
)
)
- self.report.ergc_records_deleted += 1
yield entry
def _delete_entry(self, entry: CleanupRecord) -> None:
try:
- logger.info(
- f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
- )
self.graph.delete_entity(entry.urn, True)
+ self.report.ergc_records_deleted += 1
except Exception as e:
self.report.ergc_delete_errors += 1
self.report.failure(
@@ -252,10 +264,23 @@ def _reached_runtime_limit(self) -> bool:
>= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
)
):
+ self.report.ergc_runtime_limit_reached = True
logger.info(f"ergc({self.instance_id}): max runtime reached.")
return True
return False
+ def _reached_delete_limit(self) -> bool:
+ if (
+ self.config.limit_entities_delete
+ and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+ ):
+ logger.info(
+ f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+ )
+ self.report.ergc_delete_limit_reached = True
+ return True
+ return False
+
def run(self) -> None:
if not self.config.enabled:
logger.info(
@@ -274,7 +299,7 @@ def run(self) -> None:
)
for entry in self._scroll_garbage_records():
- if self._reached_runtime_limit():
+ if self._reached_runtime_limit() or self._reached_delete_limit():
break
self._delete_entry(entry)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
index cf810d05aa2ca1..471eeff0224ed1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -19,8 +19,8 @@
logger = logging.getLogger(__name__)
-QUERY_QUERY_ENTITY = """
-query listQueries($input: ScrollAcrossEntitiesInput!) {
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
scrollAcrossEntities(input: $input) {
nextScrollId
count
@@ -29,6 +29,9 @@
... on QueryEntity {
urn
}
+ ... on DataProcessInstance {
+ urn
+ }
}
}
}
@@ -225,19 +228,28 @@ def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
time.sleep(self.config.delay)
return futures
- def _get_soft_deleted_queries(self) -> Iterable[str]:
+ def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
assert self.ctx.graph
scroll_id: Optional[str] = None
+
+ batch_size = self.config.batch_size
+ if entity_type == "DATA_PROCESS_INSTANCE":
+ # Due to a bug in DataProcessInstance querying, this is a temporary workaround
+ # to avoid a giant stacktrace by using a smaller batch size on the first call.
+ # This will be removed in a future version, once servers with the fix have
+ # been around for a while.
+ batch_size = 10
+
while True:
try:
result = self.ctx.graph.execute_graphql(
- QUERY_QUERY_ENTITY,
+ graphql_query,
{
"input": {
- "types": ["QUERY"],
+ "types": [entity_type],
"query": "*",
"scrollId": scroll_id if scroll_id else None,
- "count": self.config.batch_size,
+ "count": batch_size,
"orFilters": [
{
"and": [
@@ -254,12 +266,16 @@ def _get_soft_deleted_queries(self) -> Iterable[str]:
)
except Exception as e:
self.report.failure(
- f"While trying to get queries with {scroll_id}", exc=e
+ f"While trying to get {entity_type} with {scroll_id}", exc=e
)
break
scroll_across_entities = result.get("scrollAcrossEntities")
- if not scroll_across_entities:
+ if not scroll_across_entities or not scroll_across_entities.get("count"):
break
+ if entity_type == "DATA_PROCESS_INSTANCE":
+ # Temporary workaround; see the note at the beginning of this function.
+ # Restore the configured batch size once the first call has succeeded.
+ batch_size = self.config.batch_size
scroll_id = scroll_across_entities.get("nextScrollId")
self.report.num_queries_found += scroll_across_entities.get("count")
for query in scroll_across_entities.get("searchResults"):
@@ -275,7 +291,8 @@ def _get_urns(self) -> Iterable[str]:
status=RemovedStatusFilter.ONLY_SOFT_DELETED,
batch_size=self.config.batch_size,
)
- yield from self._get_soft_deleted_queries()
+ yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+ yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
def _times_up(self) -> bool:
if (
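`_get_soft_deleted` pages through results with `scrollAcrossEntities`, re-issuing the query with the returned `nextScrollId` until the server reports an empty page. A schematic version of that loop, with `run_query` standing in for `ctx.graph.execute_graphql`:

```python
from typing import Callable, Dict, Iterable, Optional


def scroll_urns(run_query: Callable[[Optional[str]], Dict]) -> Iterable[str]:
    scroll_id: Optional[str] = None
    while True:
        page = run_query(scroll_id).get("scrollAcrossEntities") or {}
        if not page.get("count"):
            break
        for result in page.get("searchResults", []):
            yield result["entity"]["urn"]
        scroll_id = page.get("nextScrollId")
        if not scroll_id:
            break
```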
diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py
index 76f24bfd63d476..8101f0110509e3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py
@@ -203,7 +203,9 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
with PerfTimer() as timer:
table = thread_local.local_catalog.load_table(dataset_path)
time_taken = timer.elapsed_seconds()
- self.report.report_table_load_time(time_taken)
+ self.report.report_table_load_time(
+ time_taken, dataset_name, table.metadata_location
+ )
LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
yield from self._create_iceberg_workunit(dataset_name, table)
except NoSuchPropertyException as e:
@@ -247,7 +249,10 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
)
except Exception as e:
- self.report.report_failure("general", f"Failed to create workunit: {e}")
+ self.report.report_failure(
+ "general",
+ f"Failed to create workunit for dataset {dataset_name}: {e}",
+ )
LOGGER.exception(
f"Exception while processing table {dataset_path}, skipping it.",
)
@@ -312,7 +317,9 @@ def _create_iceberg_workunit(
dataset_snapshot.aspects.append(schema_metadata)
mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
- self.report.report_table_processing_time(timer.elapsed_seconds())
+ self.report.report_table_processing_time(
+ timer.elapsed_seconds(), dataset_name, table.metadata_location
+ )
yield MetadataWorkUnit(id=dataset_name, mce=mce)
dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py
index 4a7f6bf4d60c1d..83fe3d1c079f17 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py
@@ -5,6 +5,7 @@
from humanfriendly import format_timespan
from pydantic import Field, validator
from pyiceberg.catalog import Catalog, load_catalog
+from sortedcontainers import SortedList
from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -146,19 +147,40 @@ def get_catalog(self) -> Catalog:
return load_catalog(name=catalog_name, **catalog_config)
+class TopTableTimings:
+ _VALUE_FIELD: str = "timing"
+ top_entites: SortedList
+ _size: int
+
+ def __init__(self, size: int = 10):
+ self._size = size
+ self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+
+ def add(self, entity: Dict[str, Any]) -> None:
+ if self._VALUE_FIELD not in entity:
+ return
+ self.top_entites.add(entity)
+ if len(self.top_entites) > self._size:
+ self.top_entites.pop()
+
+ def __str__(self) -> str:
+ if len(self.top_entites) == 0:
+ return "no timings reported"
+ return str(list(self.top_entites))
+
+
class TimingClass:
- times: List[int]
+ times: SortedList
def __init__(self):
- self.times = []
+ self.times = SortedList()
- def add_timing(self, t):
- self.times.append(t)
+ def add_timing(self, t: float) -> None:
+ self.times.add(t)
- def __str__(self):
+ def __str__(self) -> str:
if len(self.times) == 0:
return "no timings reported"
- self.times.sort()
total = sum(self.times)
avg = total / len(self.times)
return str(
@@ -180,6 +202,9 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
load_table_timings: TimingClass = field(default_factory=TimingClass)
processing_table_timings: TimingClass = field(default_factory=TimingClass)
profiling_table_timings: TimingClass = field(default_factory=TimingClass)
+ tables_load_timings: TopTableTimings = field(default_factory=TopTableTimings)
+ tables_profile_timings: TopTableTimings = field(default_factory=TopTableTimings)
+ tables_process_timings: TopTableTimings = field(default_factory=TopTableTimings)
listed_namespaces: int = 0
total_listed_tables: int = 0
tables_listed_per_namespace: TopKDict[str, int] = field(
@@ -201,11 +226,26 @@ def report_table_scanned(self, name: str) -> None:
def report_dropped(self, ent_name: str) -> None:
self.filtered.append(ent_name)
- def report_table_load_time(self, t: float) -> None:
+ def report_table_load_time(
+ self, t: float, table_name: str, table_metadata_location: str
+ ) -> None:
self.load_table_timings.add_timing(t)
+ self.tables_load_timings.add(
+ {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+ )
- def report_table_processing_time(self, t: float) -> None:
+ def report_table_processing_time(
+ self, t: float, table_name: str, table_metadata_location: str
+ ) -> None:
self.processing_table_timings.add_timing(t)
+ self.tables_process_timings.add(
+ {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+ )
- def report_table_profiling_time(self, t: float) -> None:
+ def report_table_profiling_time(
+ self, t: float, table_name: str, table_metadata_location: str
+ ) -> None:
self.profiling_table_timings.add_timing(t)
+ self.tables_profile_timings.add(
+ {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+ )
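`TopTableTimings` above keeps only the N slowest tables by storing entries in a `SortedList` ordered by descending timing and evicting the smallest entry once the list overflows. The same pattern in isolation, with illustrative data:

```python
from sortedcontainers import SortedList

top = SortedList(key=lambda entry: -entry["timing"])
size = 3

for name, timing in [("a", 1.2), ("b", 9.5), ("c", 0.3), ("d", 4.4), ("e", 7.1)]:
    top.add({"table": name, "timing": timing})
    if len(top) > size:
        top.pop()  # drops the entry with the smallest timing

print([entry["table"] for entry in top])  # ['b', 'e', 'd']
```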
diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py
index 9cc6dd08544e4e..7642cabbd1404c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py
@@ -204,7 +204,9 @@ def profile_table(
)
dataset_profile.fieldProfiles.append(column_profile)
time_taken = timer.elapsed_seconds()
- self.report.report_table_profiling_time(time_taken)
+ self.report.report_table_profiling_time(
+ time_taken, dataset_name, table.metadata_location
+ )
LOGGER.debug(
f"Finished profiling of dataset: {dataset_name} in {time_taken}"
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
index bfae3060013d59..4e9d0f68928a45 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
folder_path_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
- description="Allow or deny dashboards from specific folders. "
+ description="Allow or deny dashboards from specific folders using their fully qualified paths. "
"For example: \n"
"deny: \n"
- " - sales/deprecated \n"
- "This pattern will deny the ingestion of all dashboards and looks within the sales/deprecated folder. \n"
+ " - Shared/deprecated \n"
+ "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+ "allow: \n"
+ " - Shared/sales \n"
+ "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+ "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+ "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
"Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
)
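An illustrative `folder_path_pattern` for the description above, expressed with the same `AllowDenyPattern` class in Python and assuming its usual regex-prefix matching; a YAML recipe would nest the equivalent allow/deny lists under `folder_path_pattern`:

```python
from datahub.configuration.common import AllowDenyPattern

folder_path_pattern = AllowDenyPattern(
    allow=["Shared/sales.*"],
    deny=["Shared/deprecated.*"],
)

assert folder_path_pattern.allowed("Shared/sales/EMEA")
assert not folder_path_pattern.allowed("Shared/deprecated/Old Dashboards")
```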
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py
index 2a247d0c63957a..4764400215e12a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -485,7 +485,7 @@ def report_dropped(self, view: str) -> None:
self.filtered_reports.append(view)
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
@config_class(PowerBiReportServerDashboardSourceConfig)
@support_status(SupportStatus.INCUBATING)
@capability(SourceCapability.OWNERSHIP, "Enabled by default")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
index 49f7941563c1a7..5371017a2a3212 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
@@ -423,10 +423,10 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
database = self.config.database
logger.info(f"Processing db {database}")
- self.report.report_ingestion_stage_start(METADATA_EXTRACTION)
- self.db_tables[database] = defaultdict()
- self.db_views[database] = defaultdict()
- self.db_schemas.setdefault(database, {})
+ with self.report.new_stage(METADATA_EXTRACTION):
+ self.db_tables[database] = defaultdict()
+ self.db_views[database] = defaultdict()
+ self.db_schemas.setdefault(database, {})
# TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
# this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +462,12 @@ def _extract_metadata(
self.process_schemas(connection, database)
)
- self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
- yield from self.extract_lineage_v2(
- connection=connection,
- database=database,
- lineage_extractor=lineage_extractor,
- )
+ with self.report.new_stage(LINEAGE_EXTRACTION):
+ yield from self.extract_lineage_v2(
+ connection=connection,
+ database=database,
+ lineage_extractor=lineage_extractor,
+ )
all_tables = self.get_all_tables()
else:
@@ -480,25 +480,25 @@ def _extract_metadata(
or self.config.include_view_lineage
or self.config.include_copy_lineage
):
- self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
- yield from self.extract_lineage(
- connection=connection, all_tables=all_tables, database=database
- )
+ with self.report.new_stage(LINEAGE_EXTRACTION):
+ yield from self.extract_lineage(
+ connection=connection, all_tables=all_tables, database=database
+ )
- self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
if self.config.include_usage_statistics:
- yield from self.extract_usage(
- connection=connection, all_tables=all_tables, database=database
- )
+ with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+ yield from self.extract_usage(
+ connection=connection, all_tables=all_tables, database=database
+ )
if self.config.is_profiling_enabled():
- self.report.report_ingestion_stage_start(PROFILING)
- profiler = RedshiftProfiler(
- config=self.config,
- report=self.report,
- state_handler=self.profiling_state_handler,
- )
- yield from profiler.get_workunits(self.db_tables)
+ with self.report.new_stage(PROFILING):
+ profiler = RedshiftProfiler(
+ config=self.config,
+ report=self.report,
+ state_handler=self.profiling_state_handler,
+ )
+ yield from profiler.get_workunits(self.db_tables)
def process_schemas(self, connection, database):
for schema in self.data_dictionary.get_schemas(
@@ -633,8 +633,8 @@ def process_schema(
else:
logger.info("View processing disabled, skipping")
- self.report.metadata_extraction_sec[report_key] = round(
- timer.elapsed_seconds(), 2
+ self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+ digits=2
)
def _process_table(
@@ -986,9 +986,7 @@ def extract_usage(
yield from usage_extractor.get_usage_workunits(all_tables=all_tables)
- self.report.usage_extraction_sec[database] = round(
- timer.elapsed_seconds(), 2
- )
+ self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
def extract_lineage(
self,
@@ -1011,8 +1009,8 @@ def extract_lineage(
database=database, connection=connection, all_tables=all_tables
)
- self.report.lineage_extraction_sec[f"{database}"] = round(
- timer.elapsed_seconds(), 2
+ self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+ digits=2
)
yield from self.generate_lineage(
database, lineage_extractor=lineage_extractor
@@ -1042,8 +1040,8 @@ def extract_lineage_v2(
yield from lineage_extractor.generate()
- self.report.lineage_extraction_sec[f"{database}"] = round(
- timer.elapsed_seconds(), 2
+ self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+ digits=2
)
if self.redundant_lineage_run_skip_handler:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
index e0bf8b23dd0f7d..d66a1ee18be40f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
@@ -182,38 +182,38 @@ def _get_workunits_internal(
self.report.num_operational_stats_filtered = 0
if self.config.include_operational_stats:
- self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS)
- with PerfTimer() as timer:
- # Generate operation aspect workunits
- yield from self._gen_operation_aspect_workunits(
- self.connection, all_tables
- )
- self.report.operational_metadata_extraction_sec[
- self.config.database
- ] = round(timer.elapsed_seconds(), 2)
+ with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+ with PerfTimer() as timer:
+ # Generate operation aspect workunits
+ yield from self._gen_operation_aspect_workunits(
+ self.connection, all_tables
+ )
+ self.report.operational_metadata_extraction_sec[
+ self.config.database
+ ] = timer.elapsed_seconds(digits=2)
# Generate aggregate events
- self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION)
- query: str = self.queries.usage_query(
- start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
- end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
- database=self.config.database,
- )
- access_events_iterable: Iterable[
- RedshiftAccessEvent
- ] = self._gen_access_events_from_history_query(
- query, connection=self.connection, all_tables=all_tables
- )
+ with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+ query: str = self.queries.usage_query(
+ start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+ end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+ database=self.config.database,
+ )
+ access_events_iterable: Iterable[
+ RedshiftAccessEvent
+ ] = self._gen_access_events_from_history_query(
+ query, connection=self.connection, all_tables=all_tables
+ )
- aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
- access_events_iterable
- )
- # Generate usage workunits from aggregated events.
- for time_bucket in aggregated_events.values():
- for aggregate in time_bucket.values():
- wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
- self.report.num_usage_workunits_emitted += 1
- yield wu
+ aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+ access_events_iterable
+ )
+ # Generate usage workunits from aggregated events.
+ for time_bucket in aggregated_events.values():
+ for aggregate in time_bucket.values():
+ wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+ self.report.num_usage_workunits_emitted += 1
+ yield wu
def _gen_operation_aspect_workunits(
self,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
index ceac9e96d1ddd0..989d0d734352a2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -6,9 +6,8 @@
import re
import time
from datetime import datetime
-from itertools import groupby
from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
from urllib.parse import urlparse
import smart_open.compression as so_compression
@@ -41,6 +40,7 @@
get_bucket_name,
get_bucket_relative_path,
get_key_prefix,
+ group_s3_objects_by_dirname,
strip_s3_prefix,
)
from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@
from datahub.telemetry import stats, telemetry
from datahub.utilities.perf_timer import PerfTimer
+if TYPE_CHECKING:
+ from mypy_boto3_s3.service_resource import Bucket
+
# hide annoying debug errors from py4j
logging.getLogger("py4j").setLevel(logging.ERROR)
logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ def get_dir_to_process(
def get_folder_info(
self,
path_spec: PathSpec,
- bucket: Any, # Todo: proper type
+ bucket: "Bucket",
prefix: str,
) -> List[Folder]:
"""
@@ -857,22 +860,15 @@ def get_folder_info(
Parameters:
path_spec (PathSpec): The path specification used to determine partitioning.
- bucket (Any): The S3 bucket object.
+ bucket (Bucket): The S3 bucket object.
prefix (str): The prefix path in the S3 bucket to list objects from.
Returns:
List[Folder]: A list of Folder objects representing the partitions found.
"""
-
- prefix_to_list = prefix
- files = list(
- bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
- )
- files = sorted(files, key=lambda a: a.last_modified)
- grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
partitions: List[Folder] = []
- for key, group in grouped_files:
+ s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+ for key, group in group_s3_objects_by_dirname(s3_objects).items():
file_size = 0
creation_time = None
modification_time = None
@@ -904,7 +900,7 @@ def get_folder_info(
Folder(
partition_id=id,
is_partition=bool(id),
- creation_time=creation_time if creation_time else None,
+ creation_time=creation_time if creation_time else None, # type: ignore[arg-type]
modification_time=modification_time,
sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
size=file_size,
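For reference, the helper imported above replaces the previous sort-plus-itertools.groupby logic with a single dict-based grouping, so all objects under the same directory prefix end up in one group regardless of iteration order. A minimal sketch of what such a helper can look like (illustrative only; the imported group_s3_objects_by_dirname is the canonical implementation and its exact signature may differ):

    # Hypothetical standalone equivalent, for illustration only.
    from collections import defaultdict
    from typing import Any, Dict, Iterable, List

    def group_by_dirname(s3_objects: Iterable[Any]) -> Dict[str, List[Any]]:
        """Group S3 object summaries by the directory part of their key."""
        grouped: Dict[str, List[Any]] = defaultdict(list)
        for obj in s3_objects:
            dirname = obj.key.rsplit("/", 1)[0]  # "a/b/c.parquet" -> "a/b"
            grouped[dirname].append(obj)
        return dict(grouped)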
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
index 12e5fb72b00de8..2d61ce59857778 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
@@ -221,6 +221,14 @@ class SnowflakeV2Config(
default=False,
description="If enabled, uses the new queries extractor to extract queries from snowflake.",
)
+ include_queries: bool = Field(
+ default=True,
+ description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+ )
+ include_query_usage_statistics: bool = Field(
+ default=True,
+ description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+ )
lazy_schema_resolver: bool = Field(
default=True,
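The two new flags only take effect when use_queries_v2 is enabled, since they are forwarded to the queries extractor (see the snowflake_v2.py changes further below). A hedged illustration of how they would be set on a source config; the field names are the ones defined above, everything else is a placeholder:

    # Illustrative only; placeholder account settings.
    config_dict = {
        "account_id": "my_account",
        "use_queries_v2": True,                  # required for the two flags below to apply
        "include_queries": True,                 # emit query entities on lineage edges
        "include_query_usage_statistics": True,  # emit query popularity statistics
    }
    # e.g. SnowflakeV2Config.parse_obj(config_dict) when constructing the source in tests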
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
index 6b200590d7ab63..e93ecf30171f65 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -40,6 +40,7 @@
ColumnRef,
DownstreamColumnRef,
)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.time import ts_millis_to_datetime
@@ -239,6 +240,9 @@ def get_known_query_lineage(
downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)
known_lineage = KnownQueryLineageInfo(
+ query_id=get_query_fingerprint(
+ query.query_text, self.identifiers.platform, fast=True
+ ),
query_text=query.query_text,
downstream=downstream_table_urn,
upstreams=self.map_query_result_upstreams(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py
index 36825dc33fe7dc..b82734cbbe84ea 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -61,6 +61,7 @@
ColumnRef,
DownstreamColumnRef,
)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
from datahub.utilities.perf_timer import PerfTimer
@@ -475,10 +476,11 @@ def _parse_audit_log_row(
entry = PreparsedQuery(
# Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
- # job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint
- # here so that the aggregator auto-generates one.
- # query_id=res["query_fingerprint"],
- query_id=None,
+ # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
+ # here.
+ query_id=get_query_fingerprint(
+ res["query_text"], self.identifiers.platform, fast=True
+ ),
query_text=res["query_text"],
upstreams=upstreams,
downstream=downstream,
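Both this audit-log path and the lineage-v2 change above now compute the query id up front with the fast fingerprint variant. A small sketch of the call shape, with placeholder values:

    # Placeholder query text and platform; mirrors the calls added above.
    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    query_id = get_query_fingerprint(
        "SELECT * FROM db.sch.src",  # query_text from the audit log row
        "snowflake",                 # self.identifiers.platform in the source
        fast=True,                   # the cheaper fingerprint variant used here
    )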
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py
index 030b2d43be81f9..b24471f8666afa 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py
@@ -166,6 +166,3 @@ def _is_tag_scanned(self, tag_name: str) -> bool:
def report_tag_processed(self, tag_name: str) -> None:
self._processed_tags.add(tag_name)
-
- def set_ingestion_stage(self, database: str, stage: str) -> None:
- self.report_ingestion_stage_start(f"{database}: {stage}")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
index 8a1bf15b7a7bc4..6f09c26b08da2d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -216,21 +216,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
try:
for snowflake_db in self.databases:
- self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
- yield from self._process_database(snowflake_db)
+ with self.report.new_stage(
+ f"{snowflake_db.name}: {METADATA_EXTRACTION}"
+ ):
+ yield from self._process_database(snowflake_db)
- self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
- discovered_tables: List[str] = [
- self.identifiers.get_dataset_identifier(
- table_name, schema.name, db.name
- )
- for db in self.databases
- for schema in db.schemas
- for table_name in schema.tables
- ]
- if self.aggregator:
- for entry in self._external_tables_ddl_lineage(discovered_tables):
- self.aggregator.add(entry)
+ with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"):
+ discovered_tables: List[str] = [
+ self.identifiers.get_dataset_identifier(
+ table_name, schema.name, db.name
+ )
+ for db in self.databases
+ for schema in db.schemas
+ for table_name in schema.tables
+ ]
+ if self.aggregator:
+ for entry in self._external_tables_ddl_lineage(discovered_tables):
+ self.aggregator.add(entry)
except SnowflakePermissionError as e:
self.structured_reporter.failure(
@@ -332,8 +334,8 @@ def _process_database(
yield from self._process_db_schemas(snowflake_db, db_tables)
if self.profiler and db_tables:
- self.report.set_ingestion_stage(snowflake_db.name, PROFILING)
- yield from self.profiler.get_workunits(snowflake_db, db_tables)
+ with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+ yield from self.profiler.get_workunits(snowflake_db, db_tables)
def _process_db_schemas(
self,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
index 4bdf559f293b51..85e4071aec07df 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -146,59 +146,58 @@ def get_usage_workunits(
if not self._should_ingest_usage():
return
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
- if self.report.edition == SnowflakeEdition.STANDARD.value:
- logger.info(
- "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
- )
- return
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+ if self.report.edition == SnowflakeEdition.STANDARD.value:
+ logger.info(
+ "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported."
+ )
+ return
- logger.info("Checking usage date ranges")
+ logger.info("Checking usage date ranges")
- self._check_usage_date_ranges()
+ self._check_usage_date_ranges()
- # If permission error, execution returns from here
- if (
- self.report.min_access_history_time is None
- or self.report.max_access_history_time is None
- ):
- return
+ # If permission error, execution returns from here
+ if (
+ self.report.min_access_history_time is None
+ or self.report.max_access_history_time is None
+ ):
+ return
- # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
- # Now, we report the usage as well as operation metadata even if user email is absent
+ # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation
+ # Now, we report the usage as well as operation metadata even if user email is absent
- if self.config.include_usage_stats:
- yield from auto_empty_dataset_usage_statistics(
- self._get_workunits_internal(discovered_datasets),
- config=BaseTimeWindowConfig(
- start_time=self.start_time,
- end_time=self.end_time,
- bucket_duration=self.config.bucket_duration,
- ),
- dataset_urns={
- self.identifiers.gen_dataset_urn(dataset_identifier)
- for dataset_identifier in discovered_datasets
- },
- )
+ if self.config.include_usage_stats:
+ yield from auto_empty_dataset_usage_statistics(
+ self._get_workunits_internal(discovered_datasets),
+ config=BaseTimeWindowConfig(
+ start_time=self.start_time,
+ end_time=self.end_time,
+ bucket_duration=self.config.bucket_duration,
+ ),
+ dataset_urns={
+ self.identifiers.gen_dataset_urn(dataset_identifier)
+ for dataset_identifier in discovered_datasets
+ },
+ )
- self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
+ with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+ if self.config.include_operational_stats:
+ # Generate the operation workunits.
+ access_events = self._get_snowflake_history()
+ for event in access_events:
+ yield from self._get_operation_aspect_work_unit(
+ event, discovered_datasets
+ )
- if self.config.include_operational_stats:
- # Generate the operation workunits.
- access_events = self._get_snowflake_history()
- for event in access_events:
- yield from self._get_operation_aspect_work_unit(
- event, discovered_datasets
+ if self.redundant_run_skip_handler:
+ # Update the checkpoint state for this run.
+ self.redundant_run_skip_handler.update_state(
+ self.config.start_time,
+ self.config.end_time,
+ self.config.bucket_duration,
)
- if self.redundant_run_skip_handler:
- # Update the checkpoint state for this run.
- self.redundant_run_skip_handler.update_state(
- self.config.start_time,
- self.config.end_time,
- self.config.bucket_duration,
- )
-
def _get_workunits_internal(
self, discovered_datasets: List[str]
) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +385,7 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]:
)
self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False)
return
- self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2)
+ self.report.access_history_query_secs = timer.elapsed_seconds(digits=2)
for row in results:
yield from self._process_snowflake_history_row(row)
@@ -434,8 +433,8 @@ def _check_usage_date_ranges(self) -> None:
self.report.max_access_history_time = db_row["MAX_TIME"].astimezone(
tz=timezone.utc
)
- self.report.access_history_range_query_secs = round(
- timer.elapsed_seconds(), 2
+ self.report.access_history_range_query_secs = timer.elapsed_seconds(
+ digits=2
)
def _get_operation_aspect_work_unit(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
index 954e8a29c1a1bd..c0385a8d5af30a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -480,8 +480,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
identifiers=self.identifiers,
)
- self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
- yield from schema_extractor.get_workunits_internal()
+ with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+ yield from schema_extractor.get_workunits_internal()
databases = schema_extractor.databases
@@ -513,45 +513,46 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
discovered_datasets = discovered_tables + discovered_views
if self.config.use_queries_v2:
- self.report.set_ingestion_stage("*", VIEW_PARSING)
- yield from auto_workunit(self.aggregator.gen_metadata())
-
- self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
- schema_resolver = self.aggregator._schema_resolver
-
- queries_extractor = SnowflakeQueriesExtractor(
- connection=self.connection,
- config=SnowflakeQueriesExtractorConfig(
- window=self.config,
- temporary_tables_pattern=self.config.temporary_tables_pattern,
- include_lineage=self.config.include_table_lineage,
- include_usage_statistics=self.config.include_usage_stats,
- include_operations=self.config.include_operational_stats,
- user_email_pattern=self.config.user_email_pattern,
- ),
- structured_report=self.report,
- filters=self.filters,
- identifiers=self.identifiers,
- schema_resolver=schema_resolver,
- discovered_tables=discovered_datasets,
- graph=self.ctx.graph,
- )
+ with self.report.new_stage(f"*: {VIEW_PARSING}"):
+ yield from auto_workunit(self.aggregator.gen_metadata())
- # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
- # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
- # it should be pretty straightforward to refactor this and only initialize the aggregator once.
- self.report.queries_extractor = queries_extractor.report
- yield from queries_extractor.get_workunits_internal()
- queries_extractor.close()
+ with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+ schema_resolver = self.aggregator._schema_resolver
+
+ queries_extractor = SnowflakeQueriesExtractor(
+ connection=self.connection,
+ config=SnowflakeQueriesExtractorConfig(
+ window=self.config,
+ temporary_tables_pattern=self.config.temporary_tables_pattern,
+ include_lineage=self.config.include_table_lineage,
+ include_usage_statistics=self.config.include_usage_stats,
+ include_operations=self.config.include_operational_stats,
+ include_queries=self.config.include_queries,
+ include_query_usage_statistics=self.config.include_query_usage_statistics,
+ user_email_pattern=self.config.user_email_pattern,
+ ),
+ structured_report=self.report,
+ filters=self.filters,
+ identifiers=self.identifiers,
+ schema_resolver=schema_resolver,
+ discovered_tables=discovered_datasets,
+ graph=self.ctx.graph,
+ )
+
+ # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+ # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+ # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+ self.report.queries_extractor = queries_extractor.report
+ yield from queries_extractor.get_workunits_internal()
+ queries_extractor.close()
else:
if self.lineage_extractor:
- self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
- self.lineage_extractor.add_time_based_lineage_to_aggregator(
- discovered_tables=discovered_tables,
- discovered_views=discovered_views,
- )
+ with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+ self.lineage_extractor.add_time_based_lineage_to_aggregator(
+ discovered_tables=discovered_tables,
+ discovered_views=discovered_views,
+ )
# This would emit view and external table ddl lineage
# as well as query lineage via lineage_extractor
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
index e42564975c3d19..5b76fe41d92e97 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py
@@ -878,7 +878,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
urns = self.schema_resolver.get_urns()
if self.config.include_table_lineage or self.config.include_usage_statistics:
- self.report.report_ingestion_stage_start("audit log extraction")
- yield from self.get_audit_log_mcps(urns=urns)
+ with self.report.new_stage("Audit log extraction"):
+ yield from self.get_audit_log_mcps(urns=urns)
yield from self.builder.gen_workunits()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py
index 1da233bf0b22ab..e6615ff7bc3645 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/superset.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py
@@ -22,8 +22,12 @@
make_dataset_urn,
make_dataset_urn_with_platform_instance,
make_domain_urn,
+ make_term_urn,
+ make_user_urn,
)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.mcp_builder import add_domain_to_entity_wu
+from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
SourceCapability,
@@ -35,6 +39,7 @@
)
from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
get_platform_from_sqlalchemy_uri,
@@ -48,6 +53,7 @@
StatefulIngestionConfigBase,
StatefulIngestionSourceBase,
)
+from datahub.metadata.schema_classes import AuditStampClass, GlossaryTermAssociationClass, GlossaryTermInfoClass
from datahub.metadata.com.linkedin.pegasus2avro.common import (
AuditStamp,
ChangeAuditStamps,
@@ -72,6 +78,10 @@
ChartTypeClass,
DashboardInfoClass,
DatasetPropertiesClass,
+ GlossaryTermsClass,
+ OwnerClass,
+ OwnershipClass,
+ OwnershipTypeClass,
)
from datahub.utilities import config_clean
from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -231,7 +241,9 @@ def __init__(self, ctx: PipelineContext, config: SupersetConfig):
cached_domains=[domain_id for domain_id in self.config.domain],
graph=self.ctx.graph,
)
+ self.sink_config = ctx.pipeline_config.sink.config
self.session = self.login()
+ self.owners_id_to_email_dict = self.build_preset_owner_dict()
def login(self) -> requests.Session:
login_response = requests.post(
@@ -355,6 +367,61 @@ def get_datasource_urn_from_id(
)
raise ValueError("Could not construct dataset URN")
+ def _parse_owner_payload(self, payload: Dict[str, Any]) -> Dict[str, str]:
+ owners_id_to_email_dict = {}
+ for owner_data in payload["result"]:
+ owner_email = owner_data.get("extra", {}).get("email", None)
+ owner_id = owner_data.get("value", None)
+
+ if owner_id and owner_email:
+ owners_id_to_email_dict[owner_id] = owner_email
+ return owners_id_to_email_dict
+
+ def build_preset_owner_dict(self) -> Dict[str, str]:
+ owners_id_to_email_dict = {}
+ dataset_payload = self._get_all_entity_owners("dataset")
+ chart_payload = self._get_all_entity_owners("chart")
+ dashboard_payload = self._get_all_entity_owners("dashboard")
+
+ owners_id_to_email_dict.update(self._parse_owner_payload(dataset_payload))
+ owners_id_to_email_dict.update(self._parse_owner_payload(chart_payload))
+ owners_id_to_email_dict.update(self._parse_owner_payload(dashboard_payload))
+ return owners_id_to_email_dict
+
+ def build_owners_urn_list(self, data: Dict[str, Any]) -> List[str]:
+ owners_urn_list = []
+ for owner in data.get("owners", []):
+ owner_id = owner.get("id")
+ owner_email = self.owners_id_to_email_dict.get(owner_id)
+ if owner_email is not None:
+ owners_urn = make_user_urn(owner_email)
+ owners_urn_list.append(owners_urn)
+ return owners_urn_list
+
+ def _get_all_entity_owners(self, entity: str) -> Dict[str, Any]:
+ current_page = 1
+ total_owners = PAGE_SIZE
+ all_owners = []
+
+ while (current_page - 1) * PAGE_SIZE <= total_owners:
+ full_owners_response = self.session.get(
+ f"{self.config.connect_uri}/api/v1/{entity}/related/owners",
+ params=f"q=(page:{current_page},page_size:{PAGE_SIZE})",
+ )
+ if full_owners_response.status_code != 200:
+ logger.warning(
+ f"Failed to get {entity} data: {full_owners_response.text}"
+ )
+ current_page += 1
+ continue
+
+ payload = full_owners_response.json()
+ total_owners = payload.get("count", total_owners)
+ all_owners.extend(payload.get("result", []))
+ current_page += 1
+ # return combined payload
+ return {"result": all_owners, "count": total_owners}
+
def construct_dashboard_from_api_data(
self, dashboard_data: dict
) -> DashboardSnapshot:
@@ -427,6 +494,20 @@ def construct_dashboard_from_api_data(
customProperties=custom_properties,
)
dashboard_snapshot.aspects.append(dashboard_info)
+
+ dashboard_owners_list = self.build_owners_urn_list(dashboard_data)
+ owners_info = OwnershipClass(
+ owners=[
+ OwnerClass(
+ owner=urn,
+ # Default owners ingested from Preset to Technical Owner
+ type=OwnershipTypeClass.TECHNICAL_OWNER,
+ )
+ for urn in (dashboard_owners_list or [])
+ ],
+ )
+ dashboard_snapshot.aspects.append(owners_info)
+
return dashboard_snapshot
def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
@@ -522,10 +603,24 @@ def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
title=title,
lastModified=last_modified,
chartUrl=chart_url,
- inputs=[datasource_urn] if datasource_urn else None,
+ inputs=datasource_urn,
customProperties=custom_properties,
)
chart_snapshot.aspects.append(chart_info)
+
+ chart_owners_list = self.build_owners_urn_list(chart_data)
+ owners_info = OwnershipClass(
+ owners=[
+ OwnerClass(
+ owner=urn,
+ # Default owners ingested from Preset to Technical Owner
+ type=OwnershipTypeClass.TECHNICAL_OWNER,
+ )
+ for urn in (chart_owners_list or [])
+ ],
+ )
+ chart_snapshot.aspects.append(owners_info)
+
return chart_snapshot
def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
@@ -588,6 +683,54 @@ def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
env=self.config.env,
)
+ def check_if_term_exists(self, term_urn):
+ graph = DataHubGraph(
+ DatahubClientConfig(server=self.sink_config.get("server", ""), token=self.sink_config.get("token", "")))
+ # Query multiple aspects from entity
+ result = graph.get_entity_semityped(
+ entity_urn=term_urn,
+ aspects=["glossaryTermInfo"],
+ )
+
+ if result.get("glossaryTermInfo"):
+ return True
+ return False
+
+ def parse_glossary_terms_from_metrics(self, metrics, last_modified) -> GlossaryTermsClass:
+ glossary_term_urns = []
+ for metric in metrics:
+ # We only sync certified metrics
+ if "certified_by" in metric.get("extra", {}):
+ expression = metric.get("expression", "")
+ certification_details = metric.get("extra", "")
+ metric_name = metric.get("metric_name", "")
+ description = metric.get("description", "")
+ term_urn = make_term_urn(metric_name)
+
+ if self.check_if_term_exists(term_urn):
+ logger.info(f"Term {term_urn} already exists")
+ glossary_term_urns.append(GlossaryTermAssociationClass(urn=term_urn))
+ continue
+
+ term_properties_aspect = GlossaryTermInfoClass(
+ definition=f"Description: {description} \nSql Expression: {expression} \nCertification details: {certification_details}",
+ termSource="",
+ )
+
+ event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
+ entityUrn=term_urn,
+ aspect=term_properties_aspect,
+ )
+
+ # Create rest emitter
+ rest_emitter = DatahubRestEmitter(gms_server=self.sink_config.get("server", ""),
+ token=self.sink_config.get("token", ""))
+ rest_emitter.emit(event)
+ logger.info(f"Created Glossary term {term_urn}")
+ glossary_term_urns.append(GlossaryTermAssociationClass(urn=term_urn))
+
+ return GlossaryTermsClass(terms=glossary_term_urns, auditStamp=last_modified)
+
def construct_dataset_from_dataset_data(
self, dataset_data: dict
) -> DatasetSnapshot:
@@ -596,7 +739,11 @@ def construct_dataset_from_dataset_data(
datasource_urn = self.get_datasource_urn_from_id(
dataset_response, self.platform
)
-
+ modified_ts = int(
+ dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+ )
+ modified_actor = f"urn:li:corpuser:{(dataset_data.get('changed_by') or {}).get('username', 'unknown')}"
+ last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
dataset_info = DatasetPropertiesClass(
@@ -615,6 +762,25 @@ def construct_dataset_from_dataset_data(
]
)
+ dataset_owners_list = self.build_owners_urn_list(dataset_data)
+ owners_info = OwnershipClass(
+ owners=[
+ OwnerClass(
+ owner=urn,
+ # Default owners ingested from Preset to Technical Owner
+ type=OwnershipTypeClass.TECHNICAL_OWNER,
+ )
+ for urn in (dataset_owners_list or [])
+ ],
+ )
+ aspects_items.append(owners_info)
+
+ metrics = dataset_response.get("result", {}).get("metrics", [])
+
+ if metrics:
+ glossary_terms = self.parse_glossary_terms_from_metrics(metrics, last_modified)
+ aspects_items.append(glossary_terms)
+
dataset_snapshot = DatasetSnapshot(
urn=datasource_urn,
aspects=aspects_items,
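Putting the owner changes above together: a one-time id-to-email map is built from the paginated related/owners endpoints, and each dashboard, chart, and dataset then resolves its owner ids into corpuser URNs. A condensed illustration with made-up payloads (the helper names and classes are the ones used above):

    from datahub.emitter.mce_builder import make_user_urn

    owners_id_to_email = {17: "alice@acme.io", 42: "bob@acme.io"}  # built once per run
    dashboard_data = {"owners": [{"id": 17}, {"id": 99}]}          # 99 has no email -> skipped

    owner_urns = [
        make_user_urn(owners_id_to_email[o["id"]])
        for o in dashboard_data["owners"]
        if o["id"] in owners_id_to_email
    ]
    # -> ["urn:li:corpuser:alice@acme.io"], later wrapped in an OwnershipClass aspect
    #    with OwnershipTypeClass.TECHNICAL_OWNER, as in the code above.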
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
index 008216fea89508..2543cbe653ba72 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
@@ -2,9 +2,9 @@
import logging
import re
import time
-from collections import OrderedDict
-from dataclasses import dataclass
-from datetime import datetime
+from collections import OrderedDict, defaultdict
+from dataclasses import dataclass, field as dataclass_field
+from datetime import datetime, timedelta, timezone
from functools import lru_cache
from typing import (
Any,
@@ -118,6 +118,7 @@
)
from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.metadata.com.linkedin.pegasus2avro.common import (
AuditStamp,
ChangeAuditStamps,
@@ -170,6 +171,8 @@
create_lineage_sql_parsed_result,
)
from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
from datahub.utilities.urns.dataset_urn import DatasetUrn
try:
@@ -196,6 +199,11 @@
504, # Gateway Timeout
]
+# From experience, this expiry time typically ranges from 50 minutes
+# to 2 hours, but it might as well be configurable. We treat auth errors
+# raised more than 10 minutes after sign-in as genuine credential expiry.
+REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)
+
logger: logging.Logger = logging.getLogger(__name__)
# Replace / with |
@@ -637,12 +645,42 @@ class SiteIdContentUrl:
site_content_url: str
-class TableauSourceReport(StaleEntityRemovalSourceReport):
+@dataclass
+class TableauSourceReport(
+ StaleEntityRemovalSourceReport,
+ IngestionStageReport,
+):
get_all_datasources_query_failed: bool = False
num_get_datasource_query_failures: int = 0
num_datasource_field_skipped_no_name: int = 0
num_csql_field_skipped_no_name: int = 0
num_table_field_skipped_no_name: int = 0
+ # timers
+ extract_usage_stats_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
+ fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+ populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
+ populate_projects_registry_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
+ emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+ emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+ emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict)
+ emit_embedded_datasources_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
+ emit_published_datasources_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
+ emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
+ emit_upstream_tables_timer: Dict[str, float] = dataclass_field(
+ default_factory=TopKDict
+ )
# lineage
num_tables_with_upstream_lineage: int = 0
num_upstream_table_lineage: int = 0
@@ -653,7 +691,15 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
num_upstream_table_lineage_failed_parse_sql: int = 0
num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
num_hidden_assets_skipped: int = 0
- logged_in_user: List[UserInfo] = []
+ logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+
+ last_authenticated_at: Optional[datetime] = None
+
+ num_expected_tableau_metadata_queries: int = 0
+ num_actual_tableau_metadata_queries: int = 0
+ tableau_server_error_stats: Dict[str, int] = dataclass_field(
+ default_factory=(lambda: defaultdict(int))
+ )
def report_user_role(report: TableauSourceReport, server: Server) -> None:
@@ -724,6 +770,7 @@ def _authenticate(self, site_content_url: str) -> None:
try:
logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
self.server = self.config.make_tableau_client(site_content_url)
+ self.report.last_authenticated_at = datetime.now(timezone.utc)
report_user_role(report=self.report, server=self.server)
# Note that we're not catching ConfigurationError, since we want that to throw.
except ValueError as e:
@@ -807,16 +854,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
site_source = TableauSiteSource(
config=self.config,
ctx=self.ctx,
- site=site
- if site
- else SiteIdContentUrl(
- site_id=self.server.site_id, site_content_url=self.config.site
+ site=(
+ site
+ if site
+ else SiteIdContentUrl(
+ site_id=self.server.site_id,
+ site_content_url=self.config.site,
+ )
),
report=self.report,
server=self.server,
platform=self.platform,
)
yield from site_source.ingest_tableau_site()
+
except MetadataQueryException as md_exception:
self.report.failure(
title="Failed to Retrieve Tableau Metadata",
@@ -925,6 +976,7 @@ def _re_authenticate(self) -> None:
# Sign-in again may not be enough because Tableau sometimes caches invalid sessions
# so we need to recreate the Tableau Server object
self.server = self.config.make_tableau_client(self.site_content_url)
+ self.report.last_authenticated_at = datetime.now(timezone.utc)
def _populate_usage_stat_registry(self) -> None:
if self.server is None:
@@ -1190,6 +1242,7 @@ def get_connection_object_page(
)
try:
assert self.server is not None
+ self.report.num_actual_tableau_metadata_queries += 1
query_data = query_metadata_cursor_based_pagination(
server=self.server,
main_query=query,
@@ -1199,25 +1252,36 @@ def get_connection_object_page(
qry_filter=query_filter,
)
- except REAUTHENTICATE_ERRORS:
- if not retry_on_auth_error:
+ except REAUTHENTICATE_ERRORS as e:
+ self.report.tableau_server_error_stats[e.__class__.__name__] += 1
+ if not retry_on_auth_error or retries_remaining <= 0:
raise
- # If ingestion has been running for over 2 hours, the Tableau
- # temporary credentials will expire. If this happens, this exception
- # will be thrown, and we need to re-authenticate and retry.
- self._re_authenticate()
+ # We have been getting some irregular authorization errors like the one below well before the
+ # expected expiry time - within a few seconds of initial authentication. We retry without
+ # re-authenticating in such cases:
+ # b'{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}'
+ if self.report.last_authenticated_at and (
+ datetime.now(timezone.utc) - self.report.last_authenticated_at
+ > REGULAR_AUTH_EXPIRY_PERIOD
+ ):
+ # If ingestion has been running for over 2 hours, the Tableau
+ # temporary credentials will expire. If this happens, this exception
+ # will be thrown, and we need to re-authenticate and retry.
+ self._re_authenticate()
+
return self.get_connection_object_page(
query=query,
connection_type=connection_type,
query_filter=query_filter,
fetch_size=fetch_size,
current_cursor=current_cursor,
- retry_on_auth_error=False,
+ retry_on_auth_error=True,
retries_remaining=retries_remaining - 1,
)
except InternalServerError as ise:
+ self.report.tableau_server_error_stats[InternalServerError.__name__] += 1
# In some cases Tableau Server returns a 504 error, which is a timeout, so it is worth retrying.
# Extended with other retryable errors.
if ise.code in RETRIABLE_ERROR_CODES:
@@ -1230,13 +1294,14 @@ def get_connection_object_page(
query_filter=query_filter,
fetch_size=fetch_size,
current_cursor=current_cursor,
- retry_on_auth_error=False,
+ retry_on_auth_error=True,
retries_remaining=retries_remaining - 1,
)
else:
raise ise
except OSError:
+ self.report.tableau_server_error_stats[OSError.__name__] += 1
# In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04),
# the request logic was changed to use threads.
# https://github.com/tableau/server-client-python/commit/307d8a20a30f32c1ce615cca7c6a78b9b9bff081
@@ -1251,7 +1316,7 @@ def get_connection_object_page(
query_filter=query_filter,
fetch_size=fetch_size,
current_cursor=current_cursor,
- retry_on_auth_error=False,
+ retry_on_auth_error=True,
retries_remaining=retries_remaining - 1,
)
@@ -1339,7 +1404,7 @@ def get_connection_object_page(
query_filter=query_filter,
fetch_size=fetch_size,
current_cursor=current_cursor,
- retry_on_auth_error=False,
+ retry_on_auth_error=True,
retries_remaining=retries_remaining,
)
raise RuntimeError(f"Query {connection_type} error: {errors}")
@@ -1377,6 +1442,7 @@ def get_connection_objects(
while has_next_page:
filter_: str = make_filter(filter_page)
+ self.report.num_expected_tableau_metadata_queries += 1
(
connection_objects,
current_cursor,
@@ -3457,33 +3523,87 @@ def _create_workbook_properties(
return {"permissions": json.dumps(groups)} if len(groups) > 0 else None
def ingest_tableau_site(self):
- # Initialise the dictionary to later look-up for chart and dashboard stat
- if self.config.extract_usage_stats:
- self._populate_usage_stat_registry()
-
- if self.config.permission_ingestion:
- self._fetch_groups()
-
- # Populate the map of database names and database hostnames to be used later to map
- # databases to platform instances.
- if self.config.database_hostname_to_platform_instance_map:
- self._populate_database_server_hostname_map()
-
- self._populate_projects_registry()
-
- if self.config.add_site_container:
- yield from self.emit_site_container()
- yield from self.emit_project_containers()
- yield from self.emit_workbooks()
- if self.sheet_ids:
- yield from self.emit_sheets()
- if self.dashboard_ids:
- yield from self.emit_dashboards()
- if self.embedded_datasource_ids_being_used:
- yield from self.emit_embedded_datasources()
- if self.datasource_ids_being_used:
- yield from self.emit_published_datasources()
- if self.custom_sql_ids_being_used:
- yield from self.emit_custom_sql_datasources()
- if self.database_tables:
- yield from self.emit_upstream_tables()
+ with self.report.new_stage(
+ f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}"
+ ):
+ # Initialise the dictionary to later look-up for chart and dashboard stat
+ if self.config.extract_usage_stats:
+ with PerfTimer() as timer:
+ self._populate_usage_stat_registry()
+ self.report.extract_usage_stats_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.config.permission_ingestion:
+ with PerfTimer() as timer:
+ self._fetch_groups()
+ self.report.fetch_groups_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ # Populate the map of database names and database hostnames to be used later to map
+ # databases to platform instances.
+ if self.config.database_hostname_to_platform_instance_map:
+ with PerfTimer() as timer:
+ self._populate_database_server_hostname_map()
+ self.report.populate_database_server_hostname_map_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ with PerfTimer() as timer:
+ self._populate_projects_registry()
+ self.report.populate_projects_registry_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.config.add_site_container:
+ yield from self.emit_site_container()
+ yield from self.emit_project_containers()
+
+ with PerfTimer() as timer:
+ yield from self.emit_workbooks()
+ self.report.emit_workbooks_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.sheet_ids:
+ with PerfTimer() as timer:
+ yield from self.emit_sheets()
+ self.report.emit_sheets_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.dashboard_ids:
+ with PerfTimer() as timer:
+ yield from self.emit_dashboards()
+ self.report.emit_dashboards_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.embedded_datasource_ids_being_used:
+ with PerfTimer() as timer:
+ yield from self.emit_embedded_datasources()
+ self.report.emit_embedded_datasources_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.datasource_ids_being_used:
+ with PerfTimer() as timer:
+ yield from self.emit_published_datasources()
+ self.report.emit_published_datasources_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.custom_sql_ids_being_used:
+ with PerfTimer() as timer:
+ yield from self.emit_custom_sql_datasources()
+ self.report.emit_custom_sql_datasources_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
+
+ if self.database_tables:
+ with PerfTimer() as timer:
+ yield from self.emit_upstream_tables()
+ self.report.emit_upstream_tables_timer[
+ self.site_content_url
+ ] = timer.elapsed_seconds(digits=2)
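The retry changes above boil down to one decision: authorization errors raised within roughly ten minutes of the last successful sign-in are retried as-is (they are assumed to be the spurious 401s described in the comment), while later ones trigger a re-authentication before retrying; retries_remaining still bounds both paths. A condensed, illustrative version of that check:

    from datetime import datetime, timedelta, timezone
    from typing import Optional

    REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)

    def should_reauthenticate(last_authenticated_at: Optional[datetime]) -> bool:
        """Re-authenticate only when enough time has passed for credentials to expire."""
        if last_authenticated_at is None:
            return False
        age = datetime.now(timezone.utc) - last_authenticated_at
        return age > REGULAR_AUTH_EXPIRY_PERIOD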
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
index 7bfa7fdb28aaf8..43bd788f809c3e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
@@ -26,9 +26,6 @@
gen_containers,
)
from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
-from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
- EnsureAspectSizeProcessor,
-)
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
SupportStatus,
@@ -263,90 +260,89 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
StaleEntityRemovalHandler.create(
self, self.config, self.ctx
).workunit_processor,
- EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
]
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
- self.report.report_ingestion_stage_start("Ingestion Setup")
- wait_on_warehouse = None
- if self.config.include_hive_metastore:
- self.report.report_ingestion_stage_start("Start warehouse")
- # Can take several minutes, so start now and wait later
- wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
- if wait_on_warehouse is None:
- self.report.report_failure(
- "initialization",
- f"SQL warehouse {self.config.profiling.warehouse_id} not found",
- )
- return
- else:
- # wait until warehouse is started
- wait_on_warehouse.result()
+ with self.report.new_stage("Ingestion Setup"):
+ wait_on_warehouse = None
+ if self.config.include_hive_metastore:
+ with self.report.new_stage("Start warehouse"):
+ # Can take several minutes, so start now and wait later
+ wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+ if wait_on_warehouse is None:
+ self.report.report_failure(
+ "initialization",
+ f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+ )
+ return
+ else:
+ # wait until warehouse is started
+ wait_on_warehouse.result()
if self.config.include_ownership:
- self.report.report_ingestion_stage_start("Ingest service principals")
- self.build_service_principal_map()
- self.build_groups_map()
+ with self.report.new_stage("Ingest service principals"):
+ self.build_service_principal_map()
+ self.build_groups_map()
if self.config.include_notebooks:
- self.report.report_ingestion_stage_start("Ingest notebooks")
- yield from self.process_notebooks()
+ with self.report.new_stage("Ingest notebooks"):
+ yield from self.process_notebooks()
yield from self.process_metastores()
yield from self.get_view_lineage()
if self.config.include_notebooks:
- self.report.report_ingestion_stage_start("Notebook lineage")
- for notebook in self.notebooks.values():
- wu = self._gen_notebook_lineage(notebook)
- if wu:
- yield wu
+ with self.report.new_stage("Notebook lineage"):
+ for notebook in self.notebooks.values():
+ wu = self._gen_notebook_lineage(notebook)
+ if wu:
+ yield wu
if self.config.include_usage_statistics:
- self.report.report_ingestion_stage_start("Ingest usage")
- usage_extractor = UnityCatalogUsageExtractor(
- config=self.config,
- report=self.report,
- proxy=self.unity_catalog_api_proxy,
- table_urn_builder=self.gen_dataset_urn,
- user_urn_builder=self.gen_user_urn,
- )
- yield from usage_extractor.get_usage_workunits(
- self.table_refs | self.view_refs
- )
-
- if self.config.is_profiling_enabled():
- self.report.report_ingestion_stage_start("Start warehouse")
- # Need to start the warehouse again for profiling,
- # as it may have been stopped after ingestion might take
- # longer time to complete
- wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
- if wait_on_warehouse is None:
- self.report.report_failure(
- "initialization",
- f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+ with self.report.new_stage("Ingest usage"):
+ usage_extractor = UnityCatalogUsageExtractor(
+ config=self.config,
+ report=self.report,
+ proxy=self.unity_catalog_api_proxy,
+ table_urn_builder=self.gen_dataset_urn,
+ user_urn_builder=self.gen_user_urn,
+ )
+ yield from usage_extractor.get_usage_workunits(
+ self.table_refs | self.view_refs
)
- return
- else:
- # wait until warehouse is started
- wait_on_warehouse.result()
- self.report.report_ingestion_stage_start("Profiling")
- if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
- yield from UnityCatalogAnalyzeProfiler(
- self.config.profiling,
- self.report,
- self.unity_catalog_api_proxy,
- self.gen_dataset_urn,
- ).get_workunits(self.table_refs)
- elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
- yield from UnityCatalogGEProfiler(
- sql_common_config=self.config,
- profiling_config=self.config.profiling,
- report=self.report,
- ).get_workunits(list(self.tables.values()))
- else:
- raise ValueError("Unknown profiling config method")
+ if self.config.is_profiling_enabled():
+ with self.report.new_stage("Start warehouse"):
+ # Need to start the warehouse again for profiling,
+ # as it may have been stopped after ingestion might take
+ # longer time to complete
+ wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+ if wait_on_warehouse is None:
+ self.report.report_failure(
+ "initialization",
+ f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+ )
+ return
+ else:
+ # wait until warehouse is started
+ wait_on_warehouse.result()
+
+ with self.report.new_stage("Profiling"):
+ if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+ yield from UnityCatalogAnalyzeProfiler(
+ self.config.profiling,
+ self.report,
+ self.unity_catalog_api_proxy,
+ self.gen_dataset_urn,
+ ).get_workunits(self.table_refs)
+ elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+ yield from UnityCatalogGEProfiler(
+ sql_common_config=self.config,
+ profiling_config=self.config.profiling,
+ report=self.report,
+ ).get_workunits(list(self.tables.values()))
+ else:
+ raise ValueError("Unknown profiling config method")
def build_service_principal_map(self) -> None:
try:
@@ -466,11 +462,11 @@ def process_schemas(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
self.report.schemas.dropped(schema.id)
continue
- self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}")
- yield from self.gen_schema_containers(schema)
- yield from self.process_tables(schema)
+ with self.report.new_stage(f"Ingest schema {schema.id}"):
+ yield from self.gen_schema_containers(schema)
+ yield from self.process_tables(schema)
- self.report.schemas.processed(schema.id)
+ self.report.schemas.processed(schema.id)
def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
for table in self.unity_catalog_api_proxy.tables(schema=schema):
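Note that, unlike the removed report_ingestion_stage_start() calls, the context-manager form nests naturally: in the rewritten get_workunits_internal above, "Start warehouse" runs inside "Ingestion Setup" and each block records its own duration on exit. A small self-contained sketch of that behavior (the demo report class is hypothetical):

    from dataclasses import dataclass

    from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

    @dataclass
    class _DemoReport(IngestionStageReport):  # stand-in for a real source report
        pass

    report = _DemoReport()
    with report.new_stage("Ingestion Setup"):
        with report.new_stage("Start warehouse"):
            pass  # e.g. wait for the SQL warehouse
        pass  # remaining setup work

    # Both the nested "Start warehouse" and the outer "Ingestion Setup" end up as
    # separate entries (keyed "<stage> at <utc timestamp>") in
    # report.ingestion_stage_durations.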
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py
index 2b7aae8330905e..95c2345232a1ee 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py
@@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
return builder.make_user_urn(email.split("@")[0])
+def extract_user_email(user: str) -> Optional[str]:
+ """Extracts user email from user input
+
+ >>> extract_user_email('urn:li:corpuser:abc@xyz.com')
+ 'abc@xyz.com'
+ >>> extract_user_email('urn:li:corpuser:abc')
+ >>> extract_user_email('abc@xyz.com')
+ 'abc@xyz.com'
+ """
+ if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
+ user = user.split(":")[-1]
+ return user if "@" in user else None
+
+
def make_usage_workunit(
bucket_start_time: datetime,
resource: ResourceType,
@@ -104,7 +118,7 @@ def make_usage_workunit(
DatasetUserUsageCountsClass(
user=user_urn_builder(user),
count=count,
- userEmail=user if "@" in user else None,
+ userEmail=extract_user_email(user),
)
for user, count in user_freq
],
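A quick complement to the doctests above, covering the corpGroup prefix that the doctests do not exercise:

    from datahub.ingestion.source.usage.usage_common import extract_user_email

    assert extract_user_email("urn:li:corpGroup:data-eng@acme.io") == "data-eng@acme.io"
    assert extract_user_email("urn:li:corpGroup:data-eng") is None  # no email -> omitted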
diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py
index ce683e64b3f468..130a36e254fefd 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py
@@ -1,7 +1,7 @@
import logging
+from contextlib import AbstractContextManager
from dataclasses import dataclass, field
from datetime import datetime, timezone
-from typing import Optional
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@
@dataclass
class IngestionStageReport:
- ingestion_stage: Optional[str] = None
ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
- _timer: Optional[PerfTimer] = field(
- default=None, init=False, repr=False, compare=False
- )
-
- def report_ingestion_stage_start(self, stage: str) -> None:
- if self._timer:
- elapsed = round(self._timer.elapsed_seconds(), 2)
- logger.info(
- f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds",
- stacklevel=2,
- )
- if self.ingestion_stage:
- self.ingestion_stage_durations[self.ingestion_stage] = elapsed
- else:
- self._timer = PerfTimer()
-
- self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
- logger.info(f"Stage started: {self.ingestion_stage}")
+ def new_stage(self, stage: str) -> "IngestionStageContext":
+ return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+ def __init__(self, stage: str, report: IngestionStageReport):
+ self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+ self._timer: PerfTimer = PerfTimer()
+ self._report = report
+
+ def __enter__(self) -> "IngestionStageContext":
+ logger.info(f"Stage started: {self._ingestion_stage}")
self._timer.start()
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ elapsed = self._timer.elapsed_seconds(digits=2)
+ logger.info(
+ f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+ stacklevel=2,
+ )
+ self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+ return None
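The practical difference from the removed report_ingestion_stage_start() API: a stage's duration is recorded when its with-block exits (including on an exception, since __exit__ records before letting the error propagate), rather than only when the next stage starts, so the final stage of a run also gets a duration entry. A minimal usage sketch, using a hypothetical report class:

    import time
    from dataclasses import dataclass

    from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

    @dataclass
    class _Report(IngestionStageReport):  # hypothetical report for the sketch
        pass

    report = _Report()
    with report.new_stage("my stage"):
        time.sleep(0.05)

    # e.g. {"my stage at <utc timestamp>": 0.05} (rounded to 2 digits)
    print(report.ingestion_stage_durations)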
diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
index f81eb291e89e1d..25b63ffac45f96 100644
--- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
+++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
timestamp: Optional[datetime] = None
session_id: Optional[str] = None
query_type: QueryType = QueryType.UNKNOWN
+ query_id: Optional[str] = None
@dataclasses.dataclass
@@ -198,7 +199,7 @@ def id(self) -> str:
@dataclasses.dataclass
class PreparsedQuery:
- # If not provided, we will generate one using the fast fingerprint generator.
+ # If not provided, we will generate one using the fingerprint generator.
query_id: Optional[QueryId]
query_text: str
@@ -618,12 +619,13 @@ def add_known_query_lineage(
self.report.num_known_query_lineage += 1
# Generate a fingerprint for the query.
- with self.report.sql_fingerprinting_timer:
- query_fingerprint = get_query_fingerprint(
- known_query_lineage.query_text,
- platform=self.platform.platform_name,
- fast=True,
- )
+ query_fingerprint = known_query_lineage.query_id
+ if not query_fingerprint:
+ with self.report.sql_fingerprinting_timer:
+ query_fingerprint = get_query_fingerprint(
+ known_query_lineage.query_text,
+ platform=self.platform.platform_name,
+ )
formatted_query = self._maybe_format_query(known_query_lineage.query_text)
# Register the query.
@@ -848,7 +850,6 @@ def add_preparsed_query(
query_fingerprint = get_query_fingerprint(
parsed.query_text,
platform=self.platform.platform_name,
- fast=True,
)
# Format the query.
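With query_id now an optional field on KnownQueryLineageInfo, a caller such as the Snowflake lineage path above can pass its own precomputed fast fingerprint, and the aggregator only computes a (non-fast) fingerprint when none is supplied. A hedged sketch of the calling side (URNs and values are placeholders, fields abbreviated):

    from datahub.sql_parsing.sql_parsing_aggregator import KnownQueryLineageInfo
    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    query_text = "INSERT INTO db.sch.dst SELECT * FROM db.sch.src"
    known = KnownQueryLineageInfo(
        query_id=get_query_fingerprint(query_text, "snowflake", fast=True),
        query_text=query_text,
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.dst,PROD)",
        upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.src,PROD)"],
    )
    # aggregator.add_known_query_lineage(known) then reuses the provided query_id
    # instead of fingerprinting the query text again.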
diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py
index b8c27666d7f538..fb028605c35b77 100644
--- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py
+++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py
@@ -243,7 +243,7 @@ def __post_init__(self) -> None:
# This was added in 3.24.0 from 2018-06-04.
# See https://www.sqlite.org/lang_conflict.html
if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
- self.use_sqlite_on_conflict = False
+ self._use_sqlite_on_conflict = False
else:
raise RuntimeError("SQLite version 3.24.0 or later is required")
diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py
index 9488683d6d8cac..fc1b1ed58244c3 100644
--- a/metadata-ingestion/src/datahub/utilities/perf_timer.py
+++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py
@@ -57,7 +57,7 @@ def __exit__(
self.finish()
return None
- def elapsed_seconds(self) -> float:
+ def elapsed_seconds(self, digits: int = 4) -> float:
"""
Returns the elapsed time in seconds.
"""
@@ -65,11 +65,18 @@ def elapsed_seconds(self) -> float:
return self._past_active_time
if self.end_time is None:
- return (time.perf_counter() - self.start_time) + (self._past_active_time)
+ elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time)
else:
- return (self.end_time - self.start_time) + self._past_active_time
+ elapsed = (self.end_time - self.start_time) + self._past_active_time
+
+ return round(elapsed, digits)
def assert_timer_is_running(self) -> None:
+ if not self.is_running():
+ self._error_state = True
+ logger.warning("Did you forget to start the timer?")
+
+ def is_running(self) -> bool:
"""
Returns true if timer is in running state.
Timer is in NOT in running state if
@@ -77,9 +84,7 @@ def assert_timer_is_running(self) -> None:
2. it is in paused state.
3. it had been started and finished in the past but not started again.
"""
- if self.start_time is None or self.paused or self.end_time:
- self._error_state = True
- logger.warning("Did you forget to start the timer ?")
+ return self.start_time is not None and not self.paused and self.end_time is None
def __repr__(self) -> str:
return repr(self.as_obj())
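A short usage sketch of the updated `PerfTimer` surface: `elapsed_seconds` now accepts a `digits` argument and the running-state check is exposed as `is_running()`. The workload below is a placeholder:

```python
from datahub.utilities.perf_timer import PerfTimer

with PerfTimer() as timer:
    total = sum(range(1_000_000))  # placeholder workload
    assert timer.is_running()  # True while inside the context

# Exiting the context finishes the timer; elapsed_seconds() rounds to 4 digits
# by default, or to whatever precision the caller requests.
print(f"Elapsed: {timer.elapsed_seconds(digits=2)} seconds")
```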
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
index 71e5ad10c2fc5e..d7868038a40aa1 100644
--- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
+++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
@@ -1,6 +1,6 @@
import json
import pathlib
-from typing import Any, Dict, List, cast
+from typing import Any, Dict, List, Union, cast
from unittest import mock
import pytest
@@ -13,10 +13,15 @@
GroupItem,
ProjectItem,
SiteItem,
+ UserItem,
ViewItem,
WorkbookItem,
)
from tableauserverclient.models.reference_item import ResourceReference
+from tableauserverclient.server.endpoint.exceptions import (
+ NonXMLResponseError,
+ TableauError,
+)
from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -270,7 +275,7 @@ def side_effect_site_get_by_id(id, *arg, **kwargs):
def mock_sdk_client(
- side_effect_query_metadata_response: List[dict],
+ side_effect_query_metadata_response: List[Union[dict, TableauError]],
datasources_side_effect: List[dict],
sign_out_side_effect: List[dict],
) -> mock.MagicMock:
@@ -1312,6 +1317,61 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph):
)
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_retry_on_error(pytestconfig, tmp_path, mock_datahub_graph):
+ with mock.patch(
+ "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
+ mock_datahub_graph,
+ ) as mock_checkpoint:
+ mock_checkpoint.return_value = mock_datahub_graph
+
+ with mock.patch("datahub.ingestion.source.tableau.tableau.Server") as mock_sdk:
+ mock_client = mock_sdk_client(
+ side_effect_query_metadata_response=[
+ NonXMLResponseError(
+ """{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}"""
+ ),
+ *mock_data(),
+ ],
+ sign_out_side_effect=[{}],
+ datasources_side_effect=[{}],
+ )
+ mock_client.users = mock.Mock()
+ mock_client.users.get_by_id.side_effect = [
+ UserItem(
+ name="name", site_role=UserItem.Roles.SiteAdministratorExplorer
+ )
+ ]
+ mock_sdk.return_value = mock_client
+
+ reporter = TableauSourceReport()
+ tableau_source = TableauSiteSource(
+ platform="tableau",
+ config=mock.MagicMock(),
+ ctx=mock.MagicMock(),
+ site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"),
+ server=mock_sdk.return_value,
+ report=reporter,
+ )
+
+ tableau_source.get_connection_object_page(
+ query=mock.MagicMock(),
+ connection_type=mock.MagicMock(),
+ query_filter=mock.MagicMock(),
+ current_cursor=None,
+ retries_remaining=1,
+ fetch_size=10,
+ )
+
+ assert reporter.num_actual_tableau_metadata_queries == 2
+ assert reporter.tableau_server_error_stats
+ assert reporter.tableau_server_error_stats["NonXMLResponseError"] == 1
+
+ assert reporter.warnings == []
+ assert reporter.failures == []
+
+
@freeze_time(FROZEN_TIME)
@pytest.mark.parametrize(
"extract_project_hierarchy, allowed_projects",
diff --git a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py
index 9cb80ff02657bb..24460f38298069 100644
--- a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py
+++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py
@@ -26,14 +26,14 @@
def run_test():
report = BigQueryV2Report()
- report.set_ingestion_stage("All", "Seed Data Generation")
- seed_metadata = generate_data(
- num_containers=2000,
- num_tables=20000,
- num_views=2000,
- time_range=timedelta(days=7),
- )
- all_tables = seed_metadata.all_tables
+ with report.new_stage("All: Seed Data Generation"):
+ seed_metadata = generate_data(
+ num_containers=2000,
+ num_tables=20000,
+ num_views=2000,
+ time_range=timedelta(days=7),
+ )
+ all_tables = seed_metadata.all_tables
config = BigQueryV2Config(
start_time=seed_metadata.start_time,
@@ -51,42 +51,45 @@ def run_test():
schema_resolver=SchemaResolver(platform="bigquery"),
identifiers=BigQueryIdentifierBuilder(config, report),
)
- report.set_ingestion_stage("All", "Event Generation")
-
- num_projects = 100
- projects = [f"project-{i}" for i in range(num_projects)]
- table_to_project = {table.name: random.choice(projects) for table in all_tables}
- table_refs = {str(ref_from_table(table, table_to_project)) for table in all_tables}
+ with report.new_stage("All: Event Generation"):
+ num_projects = 100
+ projects = [f"project-{i}" for i in range(num_projects)]
+ table_to_project = {table.name: random.choice(projects) for table in all_tables}
+ table_refs = {
+ str(ref_from_table(table, table_to_project)) for table in all_tables
+ }
- queries = list(
- generate_queries(
- seed_metadata,
- num_selects=240_000,
- num_operations=800_000,
- num_unique_queries=50_000,
- num_users=2000,
- query_length=NormalDistribution(2000, 500),
+ queries = list(
+ generate_queries(
+ seed_metadata,
+ num_selects=240_000,
+ num_operations=800_000,
+ num_unique_queries=50_000,
+ num_users=2000,
+ query_length=NormalDistribution(2000, 500),
+ )
)
- )
- queries.sort(key=lambda q: q.timestamp)
- events = list(generate_events(queries, projects, table_to_project, config=config))
- print(f"Events generated: {len(events)}")
- pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss
- print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}")
+ queries.sort(key=lambda q: q.timestamp)
+ events = list(
+ generate_events(queries, projects, table_to_project, config=config)
+ )
+ print(f"Events generated: {len(events)}")
+ pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss
+ print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}")
- report.set_ingestion_stage("All", "Event Ingestion")
- with PerfTimer() as timer:
- workunits = usage_extractor._get_workunits_internal(events, table_refs)
- num_workunits, peak_memory_usage = workunit_sink(workunits)
- report.set_ingestion_stage("All", "Done")
- print(f"Workunits Generated: {num_workunits}")
- print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds")
+ with report.new_stage("All: Event Ingestion"):
+ with PerfTimer() as timer:
+ workunits = usage_extractor._get_workunits_internal(events, table_refs)
+ num_workunits, peak_memory_usage = workunit_sink(workunits)
+ with report.new_stage("All: Done"):
+ print(f"Workunits Generated: {num_workunits}")
+ print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds")
- print(
- f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}"
- )
- print(f"Disk Used: {report.processing_perf.usage_state_size}")
- print(f"Hash collisions: {report.num_usage_query_hash_collisions}")
+ print(
+ f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}"
+ )
+ print(f"Disk Used: {report.processing_perf.usage_state_size}")
+ print(f"Hash collisions: {report.num_usage_query_hash_collisions}")
if __name__ == "__main__":
diff --git a/metadata-ingestion/tests/performance/databricks/test_unity.py b/metadata-ingestion/tests/performance/databricks/test_unity.py
index ddd19804ba1841..71192dc5b509bc 100644
--- a/metadata-ingestion/tests/performance/databricks/test_unity.py
+++ b/metadata-ingestion/tests/performance/databricks/test_unity.py
@@ -59,7 +59,7 @@ def run_test():
workunits = source.get_workunits()
num_workunits, peak_memory_usage = workunit_sink(workunits)
print(f"Workunits Generated: {num_workunits}")
- print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds")
+ print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds")
print(
f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}"
diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py
index 984d9e42957452..a940cce46a8f74 100644
--- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py
+++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py
@@ -53,7 +53,7 @@ def run_test():
workunits = source.get_workunits()
num_workunits, peak_memory_usage = workunit_sink(workunits)
logging.info(f"Workunits Generated: {num_workunits}")
- logging.info(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds")
+ logging.info(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds")
logging.info(source.get_report().as_string())
logging.info(
diff --git a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py
index 5f783efc559bc9..f09047c0ec4a4f 100644
--- a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py
+++ b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py
@@ -12,12 +12,14 @@ def run_test() -> None:
for i in range(N):
if i % 50 == 0:
print(
- f"Running iteration {i}, elapsed time: {timer.elapsed_seconds():.2f} seconds"
+ f"Running iteration {i}, elapsed time: {timer.elapsed_seconds(digits=2)} seconds"
)
try_format_query.__wrapped__(large_sql_query, platform="snowflake")
- print(f"Total time taken for {N} iterations: {timer.elapsed_seconds():.2f} seconds")
+ print(
+ f"Total time taken for {N} iterations: {timer.elapsed_seconds(digits=2)} seconds"
+ )
if __name__ == "__main__":
diff --git a/metadata-ingestion/tests/unit/cli/test_cli_utils.py b/metadata-ingestion/tests/unit/cli/test_cli_utils.py
index c9693c75d96fe9..c430f585200e5a 100644
--- a/metadata-ingestion/tests/unit/cli/test_cli_utils.py
+++ b/metadata-ingestion/tests/unit/cli/test_cli_utils.py
@@ -70,6 +70,10 @@ def test_fixup_gms_url():
cli_utils.fixup_gms_url("http://abc.acryl.io/api/gms")
== "https://abc.acryl.io/gms"
)
+ assert (
+ cli_utils.fixup_gms_url("http://abcd.acryl.io:8080")
+ == "https://abcd.acryl.io/gms"
+ )
def test_guess_frontend_url_from_gms_url():
diff --git a/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py
new file mode 100644
index 00000000000000..8bae38eaa74446
--- /dev/null
+++ b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py
@@ -0,0 +1,42 @@
+import time
+
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+
+
+def test_ingestion_stage_context_records_duration():
+ report = IngestionStageReport()
+ with report.new_stage(stage="Test Stage"):
+ pass
+ assert len(report.ingestion_stage_durations) == 1
+ assert "Test Stage" in next(iter(report.ingestion_stage_durations.keys()))
+
+
+def test_ingestion_stage_context_handles_exceptions():
+ report = IngestionStageReport()
+ try:
+ with report.new_stage(stage="Test Stage"):
+ raise ValueError("Test Exception")
+ except ValueError:
+ pass
+ assert len(report.ingestion_stage_durations) == 1
+ assert "Test Stage" in next(iter(report.ingestion_stage_durations))
+
+
+def test_ingestion_stage_context_report_handles_multiple_stages():
+ report = IngestionStageReport()
+ with report.new_stage(stage="Test Stage 1"):
+ time.sleep(0.1)
+ with report.new_stage(stage="Test Stage 2"):
+ time.sleep(0.1)
+ with report.new_stage(stage="Test Stage 3"):
+ time.sleep(0.1)
+ assert len(report.ingestion_stage_durations) == 3
+ assert all(
+ isinstance(duration, float) and duration > 0.0
+ for duration in report.ingestion_stage_durations.values()
+ )
+
+ sorted_stages = list(sorted(report.ingestion_stage_durations.keys()))
+ assert "Test Stage 1" in sorted_stages[0]
+ assert "Test Stage 2" in sorted_stages[1]
+ assert "Test Stage 3" in sorted_stages[2]
diff --git a/metadata-ingestion/tests/unit/s3/test_s3_source.py b/metadata-ingestion/tests/unit/s3/test_s3_source.py
index f826cf0179e221..902987213e122f 100644
--- a/metadata-ingestion/tests/unit/s3/test_s3_source.py
+++ b/metadata-ingestion/tests/unit/s3/test_s3_source.py
@@ -1,12 +1,15 @@
+from datetime import datetime
from typing import List, Tuple
+from unittest.mock import Mock
import pytest
from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
-from datahub.ingestion.source.s3.source import partitioned_folder_comparator
+from datahub.ingestion.source.s3.source import S3Source, partitioned_folder_comparator
def test_partition_comparator_numeric_folder_name():
@@ -240,3 +243,63 @@ def container_properties_filter(x: MetadataWorkUnit) -> bool:
"folder_abs_path": "my-bucket/my-dir/my-dir2",
"platform": "s3",
}
+
+
+def test_get_folder_info():
+ """
+ Test S3Source.get_folder_info returns the latest file in each folder
+ """
+
+ def _get_s3_source(path_spec_: PathSpec) -> S3Source:
+ return S3Source.create(
+ config_dict={
+ "path_spec": {
+ "include": path_spec_.include,
+ "table_name": path_spec_.table_name,
+ },
+ },
+ ctx=PipelineContext(run_id="test-s3"),
+ )
+
+ # arrange
+ path_spec = PathSpec(
+ include="s3://my-bucket/{table}/{partition0}/*.csv",
+ table_name="{table}",
+ )
+
+ bucket = Mock()
+ bucket.objects.filter().page_size = Mock(
+ return_value=[
+ Mock(
+ bucket_name="my-bucket",
+ key="my-folder/dir1/0001.csv",
+ creation_time=datetime(2025, 1, 1, 1),
+ last_modified=datetime(2025, 1, 1, 1),
+ size=100,
+ ),
+ Mock(
+ bucket_name="my-bucket",
+ key="my-folder/dir2/0001.csv",
+ creation_time=datetime(2025, 1, 1, 2),
+ last_modified=datetime(2025, 1, 1, 2),
+ size=100,
+ ),
+ Mock(
+ bucket_name="my-bucket",
+ key="my-folder/dir1/0002.csv",
+ creation_time=datetime(2025, 1, 1, 2),
+ last_modified=datetime(2025, 1, 1, 2),
+ size=100,
+ ),
+ ]
+ )
+
+ # act
+ res = _get_s3_source(path_spec).get_folder_info(
+ path_spec, bucket, prefix="/my-folder"
+ )
+
+ # assert
+ assert len(res) == 2
+ assert res[0].sample_file == "s3://my-bucket/my-folder/dir1/0002.csv"
+ assert res[1].sample_file == "s3://my-bucket/my-folder/dir2/0001.csv"
diff --git a/metadata-ingestion/tests/unit/s3/test_s3_util.py b/metadata-ingestion/tests/unit/s3/test_s3_util.py
new file mode 100644
index 00000000000000..7850d65ca8b01f
--- /dev/null
+++ b/metadata-ingestion/tests/unit/s3/test_s3_util.py
@@ -0,0 +1,29 @@
+from unittest.mock import Mock
+
+from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname
+
+
+def test_group_s3_objects_by_dirname():
+ s3_objects = [
+ Mock(key="/dir1/file1.txt"),
+ Mock(key="/dir2/file2.txt"),
+ Mock(key="/dir1/file3.txt"),
+ ]
+
+ grouped_objects = group_s3_objects_by_dirname(s3_objects)
+
+ assert len(grouped_objects) == 2
+ assert grouped_objects["/dir1"] == [s3_objects[0], s3_objects[2]]
+ assert grouped_objects["/dir2"] == [s3_objects[1]]
+
+
+def test_group_s3_objects_by_dirname_files_in_root_directory():
+ s3_objects = [
+ Mock(key="file1.txt"),
+ Mock(key="file2.txt"),
+ ]
+
+ grouped_objects = group_s3_objects_by_dirname(s3_objects)
+
+ assert len(grouped_objects) == 1
+ assert grouped_objects["/"] == s3_objects
diff --git a/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py
index b4d7cb17b66f5c..81120dfc87aba3 100644
--- a/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py
+++ b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py
@@ -4,39 +4,41 @@
MOCK_GMS_ENDPOINT = "http://fakegmshost:8080"
-def test_datahub_rest_emitter_construction():
+def test_datahub_rest_emitter_construction() -> None:
emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
- assert emitter._connect_timeout_sec == rest_emitter._DEFAULT_CONNECT_TIMEOUT_SEC
- assert emitter._read_timeout_sec == rest_emitter._DEFAULT_READ_TIMEOUT_SEC
- assert emitter._retry_status_codes == rest_emitter._DEFAULT_RETRY_STATUS_CODES
- assert emitter._retry_max_times == rest_emitter._DEFAULT_RETRY_MAX_TIMES
+ assert emitter._session_config.timeout == rest_emitter._DEFAULT_TIMEOUT_SEC
+ assert (
+ emitter._session_config.retry_status_codes
+ == rest_emitter._DEFAULT_RETRY_STATUS_CODES
+ )
+ assert (
+ emitter._session_config.retry_max_times == rest_emitter._DEFAULT_RETRY_MAX_TIMES
+ )
-def test_datahub_rest_emitter_timeout_construction():
+def test_datahub_rest_emitter_timeout_construction() -> None:
emitter = DatahubRestEmitter(
MOCK_GMS_ENDPOINT, connect_timeout_sec=2, read_timeout_sec=4
)
- assert emitter._connect_timeout_sec == 2
- assert emitter._read_timeout_sec == 4
+ assert emitter._session_config.timeout == (2, 4)
-def test_datahub_rest_emitter_general_timeout_construction():
+def test_datahub_rest_emitter_general_timeout_construction() -> None:
emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT, timeout_sec=2, read_timeout_sec=4)
- assert emitter._connect_timeout_sec == 2
- assert emitter._read_timeout_sec == 4
+ assert emitter._session_config.timeout == (2, 4)
-def test_datahub_rest_emitter_retry_construction():
+def test_datahub_rest_emitter_retry_construction() -> None:
emitter = DatahubRestEmitter(
MOCK_GMS_ENDPOINT,
retry_status_codes=[418],
retry_max_times=42,
)
- assert emitter._retry_status_codes == [418]
- assert emitter._retry_max_times == 42
+ assert emitter._session_config.retry_status_codes == [418]
+ assert emitter._session_config.retry_max_times == 42
-def test_datahub_rest_emitter_extra_params():
+def test_datahub_rest_emitter_extra_params() -> None:
emitter = DatahubRestEmitter(
MOCK_GMS_ENDPOINT, extra_headers={"key1": "value1", "key2": "value2"}
)
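The rewritten assertions reflect that the per-field timeout/retry attributes have been folded into a single session config on the emitter. A short construction sketch, with attribute names taken from the assertions above and a placeholder endpoint:

```python
from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(
    "http://localhost:8080",  # placeholder GMS endpoint
    connect_timeout_sec=2,
    read_timeout_sec=4,
    retry_status_codes=[429, 503],
    retry_max_times=3,
)

# Connect/read timeouts are now exposed as a single (connect, read) tuple.
assert emitter._session_config.timeout == (2, 4)
assert emitter._session_config.retry_max_times == 3
```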
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json
index 0d8822736c95eb..31d7419b2c8cca 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json
@@ -18,7 +18,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
+ "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
}
],
"fineGrainedLineages": [
@@ -32,7 +32,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
+ "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
{
"upstreamType": "FIELD_SET",
@@ -44,7 +44,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
+ "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
{
"upstreamType": "FIELD_SET",
@@ -56,7 +56,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
+ "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
}
]
}
@@ -64,7 +64,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
+ "entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -87,7 +87,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
+ "entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -114,7 +114,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
+ "entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -137,7 +137,7 @@
},
"operationType": "INSERT",
"customProperties": {
- "query_urn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
+ "query_urn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
"lastUpdatedTimestamp": 20000
}
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json
index fd8475090f009e..e22947fd96ce45 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json
@@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
+ "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
}
],
"fineGrainedLineages": [
@@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
+ "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
},
{
"upstreamType": "FIELD_SET",
@@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
+ "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
},
{
"upstreamType": "FIELD_SET",
@@ -171,7 +171,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
+ "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
}
]
}
@@ -179,7 +179,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
+ "entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -202,7 +202,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
+ "entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -229,7 +229,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
+ "entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json
index a4ac349c3c455c..b657b46476cbbd 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json
@@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
+ "query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
}
],
"fineGrainedLineages": [
@@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 0.2,
- "query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
+ "query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
},
{
"upstreamType": "FIELD_SET",
@@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 0.2,
- "query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
+ "query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
}
]
}
@@ -167,7 +167,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
+ "entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -190,7 +190,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
+ "entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -217,7 +217,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
+ "entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json
index d9d46a4b14a146..09a98a81f2602e 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json
@@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
+ "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300"
}
],
"fineGrainedLineages": [
@@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
+ "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300"
},
{
"upstreamType": "FIELD_SET",
@@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
+ "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300"
},
{
"upstreamType": "FIELD_SET",
@@ -171,7 +171,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
+ "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300"
}
]
}
@@ -179,7 +179,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
+ "entityUrn": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -202,7 +202,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
+ "entityUrn": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -229,7 +229,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
+ "entityUrn": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -257,7 +257,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559"
+ "query": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c"
}
]
}
@@ -265,7 +265,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559",
+ "entityUrn": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -288,7 +288,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559",
+ "entityUrn": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -306,7 +306,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559",
+ "entityUrn": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -334,7 +334,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae"
+ "query": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d"
}
]
}
@@ -342,7 +342,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae",
+ "entityUrn": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -365,7 +365,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae",
+ "entityUrn": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -383,7 +383,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae",
+ "entityUrn": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -411,7 +411,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904"
+ "query": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943"
},
{
"auditStamp": {
@@ -424,7 +424,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_incremental,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f"
+ "query": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2"
}
]
}
@@ -432,7 +432,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
+ "entityUrn": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -455,7 +455,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
+ "entityUrn": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -473,7 +473,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
+ "entityUrn": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -484,7 +484,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f",
+ "entityUrn": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -507,7 +507,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f",
+ "entityUrn": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -525,7 +525,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f",
+ "entityUrn": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json
index b4eaf76a149337..69bcd8eb10e951 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json
@@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
+ "query": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df"
},
{
"auditStamp": {
@@ -146,7 +146,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
+ "query": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df"
}
],
"fineGrainedLineages": [
@@ -161,7 +161,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
+ "query": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df"
}
]
}
@@ -169,7 +169,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
+ "entityUrn": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -192,7 +192,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
+ "entityUrn": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -219,7 +219,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
+ "entityUrn": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -247,7 +247,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80"
+ "query": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544"
},
{
"auditStamp": {
@@ -260,7 +260,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)",
"type": "TRANSFORMED",
- "query": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80"
+ "query": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544"
}
],
"fineGrainedLineages": [
@@ -275,7 +275,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_backup,PROD),a)"
],
"confidenceScore": 1.0,
- "query": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80"
+ "query": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544"
}
]
}
@@ -283,7 +283,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80",
+ "entityUrn": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
@@ -306,7 +306,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80",
+ "entityUrn": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
@@ -330,7 +330,7 @@
},
{
"entityType": "query",
- "entityUrn": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80",
+ "entityUrn": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py
index dbe24ade6944f6..c3c3a4a15d915b 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py
@@ -186,3 +186,15 @@ def test_query_fingerprint():
assert get_query_fingerprint(
"select 1 + 1", platform="postgres"
) != get_query_fingerprint("select 2", platform="postgres")
+
+
+def test_redshift_query_fingerprint():
+ query1 = "insert into insert_into_table (select * from base_table);"
+ query2 = "INSERT INTO insert_into_table (SELECT * FROM base_table)"
+
+ assert get_query_fingerprint(query1, "redshift") == get_query_fingerprint(
+ query2, "redshift"
+ )
+ assert get_query_fingerprint(query1, "redshift", True) != get_query_fingerprint(
+ query2, "redshift", True
+ )
diff --git a/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py
new file mode 100644
index 00000000000000..d03b08b77d5a96
--- /dev/null
+++ b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py
@@ -0,0 +1,213 @@
+from unittest.mock import Mock
+
+import pytest
+import yaml
+
+from datahub.api.entities.structuredproperties.structuredproperties import (
+ AllowedValue,
+ StructuredProperties,
+ TypeQualifierAllowedTypes,
+)
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.metadata.schema_classes import (
+ PropertyValueClass,
+ StructuredPropertyDefinitionClass,
+)
+
+
+@pytest.fixture
+def sample_yaml_content():
+ return """
+- id: test_property
+ type: string
+ description: Test description
+ display_name: Test Property
+ entity_types:
+ - dataset
+ cardinality: SINGLE
+ allowed_values:
+ - value: test_value
+ description: Test value description
+"""
+
+
+@pytest.fixture
+def sample_yaml_file(tmp_path, sample_yaml_content):
+ yaml_file = tmp_path / "test_properties.yaml"
+ yaml_file.write_text(sample_yaml_content)
+ return str(yaml_file)
+
+
+@pytest.fixture
+def mock_graph():
+ return Mock(spec=DataHubGraph)
+
+
+def test_structured_properties_basic_creation():
+ props = StructuredProperties(
+ id="test_prop", type="string", description="Test description"
+ )
+ assert props.id == "test_prop"
+ assert props.type == "urn:li:dataType:datahub.string"
+ assert props.description == "Test description"
+ assert props.urn == "urn:li:structuredProperty:test_prop"
+
+
+def test_structured_properties_validate_type():
+ # Test valid types
+ props = StructuredProperties(id="test", type="string")
+ assert props.type == "urn:li:dataType:datahub.string"
+
+ # Test invalid type
+ with pytest.raises(ValueError, match="Type .* is not allowed"):
+ StructuredProperties(id="test", type="invalid_type")
+
+
+def test_structured_properties_validate_entity_types():
+ # Test valid entity type
+ props = StructuredProperties(id="test", type="string", entity_types=["dataset"])
+ assert props.entity_types
+ assert "urn:li:entityType:datahub.dataset" in props.entity_types
+
+ # Test invalid entity type
+ with pytest.raises(ValueError, match="not a valid entity type"):
+ StructuredProperties(id="test", type="string", entity_types=["invalid_entity"])
+
+
+def test_structured_properties_from_yaml(sample_yaml_file):
+ props = StructuredProperties.from_yaml(sample_yaml_file)
+ assert len(props) == 1
+ assert props[0].id == "test_property"
+ assert props[0].type == "urn:li:dataType:datahub.string"
+ assert props[0].description == "Test description"
+ assert props[0].display_name
+ assert props[0].display_name == "Test Property"
+ assert props[0].allowed_values
+ assert len(props[0].allowed_values) == 1
+ assert props[0].allowed_values[0].value == "test_value"
+
+
+def test_structured_properties_generate_mcps():
+ props = StructuredProperties(
+ id="test_prop",
+ type="string",
+ description="Test description",
+ display_name="Test Property",
+ entity_types=["dataset"],
+ allowed_values=[
+ AllowedValue(value="test_value", description="Test value description")
+ ],
+ )
+
+ mcps = props.generate_mcps()
+ assert len(mcps) == 1
+ mcp = mcps[0]
+
+ assert mcp.entityUrn == "urn:li:structuredProperty:test_prop"
+ assert isinstance(mcp.aspect, StructuredPropertyDefinitionClass)
+ assert mcp.aspect.valueType == "urn:li:dataType:datahub.string"
+ assert mcp.aspect.description == "Test description"
+ assert mcp.aspect.allowedValues
+ assert len(mcp.aspect.allowedValues) == 1
+ assert mcp.aspect.allowedValues[0].value == "test_value"
+
+
+def test_structured_properties_from_datahub(mock_graph):
+ mock_aspect = StructuredPropertyDefinitionClass(
+ qualifiedName="test_prop",
+ valueType="urn:li:dataType:datahub.string",
+ displayName="Test Property",
+ description="Test description",
+ entityTypes=["urn:li:entityType:datahub.dataset"],
+ cardinality="SINGLE",
+ allowedValues=[
+ PropertyValueClass(value="test_value", description="Test description")
+ ],
+ )
+
+ mock_graph.get_aspect.return_value = mock_aspect
+
+ props = StructuredProperties.from_datahub(
+ mock_graph, "urn:li:structuredProperty:test_prop"
+ )
+
+ assert props.qualified_name == "test_prop"
+ assert props.type == "urn:li:dataType:datahub.string"
+ assert props.display_name == "Test Property"
+ assert props.allowed_values
+ assert len(props.allowed_values) == 1
+ assert props.allowed_values[0].value == "test_value"
+
+
+def test_structured_properties_to_yaml(tmp_path):
+ props = StructuredProperties(
+ id="test_prop",
+ type="string",
+ description="Test description",
+ allowed_values=[
+ AllowedValue(value="test_value", description="Test value description")
+ ],
+ )
+
+ yaml_file = tmp_path / "output.yaml"
+ props.to_yaml(yaml_file)
+
+ # Verify the yaml file was created and contains expected content
+ assert yaml_file.exists()
+ with open(yaml_file) as f:
+ content = yaml.safe_load(f)
+ assert content["id"] == "test_prop"
+ assert content["type"] == "urn:li:dataType:datahub.string"
+ assert content["description"] == "Test description"
+
+
+@pytest.mark.parametrize(
+ "input_type,expected_type",
+ [
+ ("string", "urn:li:dataType:datahub.string"),
+ ("STRING", "urn:li:dataType:datahub.string"),
+ ("number", "urn:li:dataType:datahub.number"),
+ ("date", "urn:li:dataType:datahub.date"),
+ ],
+)
+def test_structured_properties_type_normalization(input_type, expected_type):
+ props = StructuredProperties(id="test_prop", type=input_type)
+ assert props.type == expected_type
+
+
+def test_structured_properties_type_qualifier():
+ props = StructuredProperties(
+ id="test_prop",
+ type="urn",
+ type_qualifier=TypeQualifierAllowedTypes(allowed_types=["dataset"]),
+ )
+
+ mcps = props.generate_mcps()
+ assert mcps[0].aspect
+ assert mcps[0].aspect.typeQualifier["allowedTypes"] == [ # type: ignore
+ "urn:li:entityType:datahub.dataset"
+ ]
+
+
+def test_structured_properties_list(mock_graph):
+ mock_graph.get_urns_by_filter.return_value = [
+ "urn:li:structuredProperty:prop1",
+ "urn:li:structuredProperty:prop2",
+ ]
+
+ mock_aspect = StructuredPropertyDefinitionClass(
+ qualifiedName="test_prop",
+ valueType="urn:li:dataType:string",
+ entityTypes=["urn:li:entityType:datahub.dataset"],
+ )
+ mock_graph.get_aspect.return_value = mock_aspect
+
+ props = list(StructuredProperties.list(mock_graph))
+
+ # Verify get_urns_by_filter was called with correct arguments
+ mock_graph.get_urns_by_filter.assert_called_once_with(
+ entity_types=["structuredProperty"]
+ )
+
+ assert len(props) == 2
+ assert all(isinstance(prop, StructuredProperties) for prop in props)
diff --git a/metadata-ingestion/tests/unit/test_usage_common.py b/metadata-ingestion/tests/unit/test_usage_common.py
index e01f0ea77df837..bd6d194835dd96 100644
--- a/metadata-ingestion/tests/unit/test_usage_common.py
+++ b/metadata-ingestion/tests/unit/test_usage_common.py
@@ -5,6 +5,7 @@
from freezegun import freeze_time
from pydantic import ValidationError
+import datahub.ingestion.source.usage.usage_common
from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.time_window_config import BucketDuration, get_time_bucket
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -28,6 +29,7 @@
UserUsageCountsClass,
WindowDurationClass,
)
+from datahub.testing.doctest import assert_doctest
_TestTableRef = str
@@ -373,3 +375,7 @@ def test_convert_usage_aggregation_class():
eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.MONTH),
),
)
+
+
+def test_extract_user_email():
+ assert_doctest(datahub.ingestion.source.usage.usage_common)
diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py
index 0c362473c0cf18..bee80ec33148e9 100644
--- a/metadata-ingestion/tests/unit/urns/test_urn.py
+++ b/metadata-ingestion/tests/unit/urns/test_urn.py
@@ -4,7 +4,13 @@
import pytest
-from datahub.metadata.urns import CorpUserUrn, DatasetUrn, Urn
+from datahub.metadata.urns import (
+ CorpUserUrn,
+ DataPlatformUrn,
+ DatasetUrn,
+ SchemaFieldUrn,
+ Urn,
+)
from datahub.utilities.urns.error import InvalidUrnError
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -60,6 +66,20 @@ def test_urn_coercion() -> None:
assert urn == Urn.from_string(urn.urn())
+def test_urns_in_init() -> None:
+ platform = DataPlatformUrn("abc")
+ assert platform.urn() == "urn:li:dataPlatform:abc"
+
+ dataset_urn = DatasetUrn(platform, "def", "PROD")
+ assert dataset_urn.urn() == "urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)"
+
+ schema_field = SchemaFieldUrn(dataset_urn, "foo")
+ assert (
+ schema_field.urn()
+ == "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD),foo)"
+ )
+
+
def test_urn_type_dispatch_1() -> None:
urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)")
assert isinstance(urn, DatasetUrn)
diff --git a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py
index 6230c2e37edc6a..7e1627151c6ebf 100644
--- a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py
+++ b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py
@@ -5,6 +5,7 @@
import sqlite3
from dataclasses import dataclass
from typing import Counter, Dict
+from unittest.mock import patch
import pytest
@@ -15,6 +16,36 @@
)
+def test_set_use_sqlite_on_conflict():
+ with patch("sqlite3.sqlite_version_info", (3, 24, 0)):
+ cache = FileBackedDict[int](
+ tablename="cache",
+ cache_max_size=10,
+ cache_eviction_batch_size=10,
+ )
+ assert cache._use_sqlite_on_conflict is True
+
+ with pytest.raises(RuntimeError):
+ with patch("sqlite3.sqlite_version_info", (3, 23, 1)):
+ cache = FileBackedDict[int](
+ tablename="cache",
+ cache_max_size=10,
+ cache_eviction_batch_size=10,
+ )
+ assert cache._use_sqlite_on_conflict is False
+
+ with patch("sqlite3.sqlite_version_info", (3, 23, 1)), patch(
+ "datahub.utilities.file_backed_collections.OVERRIDE_SQLITE_VERSION_REQUIREMENT",
+ True,
+ ):
+ cache = FileBackedDict[int](
+ tablename="cache",
+ cache_max_size=10,
+ cache_eviction_batch_size=10,
+ )
+ assert cache._use_sqlite_on_conflict is False
+
+
@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False])
def test_file_dict(use_sqlite_on_conflict: bool) -> None:
cache = FileBackedDict[int](
diff --git a/metadata-integration/java/acryl-spark-lineage/README.md b/metadata-integration/java/acryl-spark-lineage/README.md
index 97851e90e860ed..e51c884c297d7e 100644
--- a/metadata-integration/java/acryl-spark-lineage/README.md
+++ b/metadata-integration/java/acryl-spark-lineage/README.md
@@ -24,7 +24,7 @@ When running jobs using spark-submit, the agent needs to be configured in the co
```text
#Configuring DataHub spark agent jar
-spark.jars.packages io.acryl:acryl-spark-lineage:0.2.16
+spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17
spark.extraListeners datahub.spark.DatahubSparkListener
spark.datahub.rest.server http://localhost:8080
```
@@ -32,7 +32,7 @@ spark.datahub.rest.server http://localhost:8080
## spark-submit command line
```sh
-spark-submit --packages io.acryl:acryl-spark-lineage:0.2.16 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py
+spark-submit --packages io.acryl:acryl-spark-lineage:0.2.17 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py
```
### Configuration Instructions: Amazon EMR
@@ -41,7 +41,7 @@ Set the following spark-defaults configuration properties as it
stated [here](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html)
```text
-spark.jars.packages io.acryl:acryl-spark-lineage:0.2.16
+spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17
spark.extraListeners datahub.spark.DatahubSparkListener
spark.datahub.rest.server https://your_datahub_host/gms
#If you have authentication set up then you also need to specify the Datahub access token
@@ -56,7 +56,7 @@ When running interactive jobs from a notebook, the listener can be configured wh
spark = SparkSession.builder
.master("spark://spark-master:7077")
.appName("test-application")
-.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.16")
+.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.17")
.config("spark.extraListeners", "datahub.spark.DatahubSparkListener")
.config("spark.datahub.rest.server", "http://localhost:8080")
.enableHiveSupport()
@@ -79,7 +79,7 @@ appName("test-application")
config("spark.master","spark://spark-master:7077")
.
-config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.16")
+config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.17")
.
config("spark.extraListeners","datahub.spark.DatahubSparkListener")
@@ -158,45 +158,47 @@ information like tokens.
## Configuration Options
-| Field | Required | Default | Description |
-|--------------------------------------------------------|----------|-----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| spark.jars.packages | ✅ | | Set with latest/required version io.acryl:acryl-spark-lineage:0.2.15 |
-| spark.extraListeners | ✅ | | datahub.spark.DatahubSparkListener |
-| spark.datahub.emitter | | rest | Specify the ways to emit metadata. By default it sends to DataHub using REST emitter. Valid options are rest, kafka or file |
-| spark.datahub.rest.server | | http://localhost:8080 | Datahub server url eg: |
-| spark.datahub.rest.token | | | Authentication token. |
-| spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! |
-| spark.datahub.rest.disable_chunked_encoding | | false | Disable Chunked Transfer Encoding. In some environment chunked encoding causes issues. With this config option it can be disabled. ||
-| spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed |
-| spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries |
-| spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set |
-| spark.datahub.kafka.bootstrap | | | The Kafka bootstrap server url to use if the Kafka emitter is set |
-| spark.datahub.kafka.schema_registry_url | | | The Schema registry url to use if the Kafka emitter is set |
-| spark.datahub.kafka.schema_registry_config. | | | Additional config to pass in to the Schema Registry Client |
-| spark.datahub.kafka.producer_config. | | | Additional config to pass in to the Kafka producer. For example: `--conf "spark.datahub.kafka.producer_config.client.id=my_client_id"` |
-| spark.datahub.metadata.pipeline.platformInstance | | | Pipeline level platform instance |
-| spark.datahub.metadata.dataset.platformInstance | | | dataset level platform instance (it is usefult to set if you have it in your glue ingestion) |
-| spark.datahub.metadata.dataset.env | | PROD | [Supported values](https://datahubproject.io/docs/graphql/enums#fabrictype). In all other cases, will fallback to PROD |
-| spark.datahub.metadata.dataset.hivePlatformAlias | | hive | By default, datahub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to `glue` |
+| Field | Required | Default | Description |
+|--------------------------------------------------------|----------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| spark.jars.packages | ✅ | | Set with latest/required version io.acryl:acryl-spark-lineage:0.2.15 |
+| spark.extraListeners | ✅ | | datahub.spark.DatahubSparkListener |
+| spark.datahub.emitter | | rest | Specify the ways to emit metadata. By default it sends to DataHub using REST emitter. Valid options are rest, kafka or file |
+| spark.datahub.rest.server                               |          | http://localhost:8080 | DataHub server URL, e.g. http://localhost:8080                                                                                                                                              |
+| spark.datahub.rest.token | | | Authentication token. |
+| spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! |
+| spark.datahub.rest.disable_chunked_encoding             |          | false                 | Disable Chunked Transfer Encoding. In some environments chunked encoding causes issues; with this config option it can be disabled.                                                         |
+| spark.datahub.rest.max_retries                          |          | 0                     | Number of times a request is retried if it fails                                                                                                                                            |
+| spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries |
+| spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set |
+| spark.datahub.kafka.bootstrap | | | The Kafka bootstrap server url to use if the Kafka emitter is set |
+| spark.datahub.kafka.schema_registry_url | | | The Schema registry url to use if the Kafka emitter is set |
+| spark.datahub.kafka.schema_registry_config. | | | Additional config to pass in to the Schema Registry Client |
+| spark.datahub.kafka.producer_config. | | | Additional config to pass in to the Kafka producer. For example: `--conf "spark.datahub.kafka.producer_config.client.id=my_client_id"` |
+| spark.datahub.metadata.pipeline.platformInstance | | | Pipeline level platform instance |
+| spark.datahub.metadata.dataset.platformInstance         |          |                       | Dataset level platform instance (useful to set if you also have it set in your Glue ingestion) |
+| spark.datahub.metadata.dataset.env                      |          | PROD                  | [Supported values](https://datahubproject.io/docs/graphql/enums#fabrictype). In all other cases it will fall back to PROD |
+| spark.datahub.metadata.dataset.hivePlatformAlias        |          | hive                  | By default, DataHub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to `glue` |
| spark.datahub.metadata.include_scheme | | true | Include scheme from the path URI (e.g. hdfs://, s3://) in the dataset URN. We recommend setting this value to false, it is set to true for backwards compatibility with previous versions |
-| spark.datahub.metadata.remove_partition_pattern | | | Remove partition pattern. (e.g. /partition=\d+) It change database/table/partition=123 to database/table |
-| spark.datahub.coalesce_jobs | | true | Only one datajob(task) will be emitted containing all input and output datasets for the spark application |
-| spark.datahub.parent.datajob_urn | | | Specified dataset will be set as upstream dataset for datajob created. Effective only when spark.datahub.coalesce_jobs is set to true |
-| spark.datahub.metadata.dataset.materialize | | false | Materialize Datasets in DataHub |
-| spark.datahub.platform.s3.path_spec_list | | | List of pathspec per platform |
-| spark.datahub.metadata.dataset.include_schema_metadata | false | | Emit dataset schema metadata based on the spark execution. It is recommended to get schema information from platform specific DataHub sources as this is less reliable |
-| spark.datahub.flow_name | | | If it is set it will be used as the DataFlow name otherwise it uses spark app name as flow_name |
-| spark.datahub.file_partition_regexp | | | Strip partition part from the path if path end matches with the specified regexp. Example `year=.*/month=.*/day=.*` |
-| spark.datahub.tags | | | Comma separated list of tags to attach to the DataFlow |
-| spark.datahub.domains | | | Comma separated list of domain urns to attach to the DataFlow |
-| spark.datahub.stage_metadata_coalescing | | | Normally it coalesces and sends metadata at the onApplicationEnd event which is never called on Databricks or on Glue. You should enable this on Databricks if you want coalesced run. |
-| spark.datahub.patch.enabled | | false | Set this to true to send lineage as a patch, which appends rather than overwrites existing Dataset lineage edges. By default, it is disabled. |
-| spark.datahub.metadata.dataset.lowerCaseUrns | | false | Set this to true to lowercase dataset urns. By default, it is disabled. |
-| spark.datahub.disableSymlinkResolution | | false | Set this to true if you prefer using the s3 location instead of the Hive table. By default, it is disabled. |
-| spark.datahub.s3.bucket | | | The name of the bucket where metadata will be written if s3 emitter is set |
-| spark.datahub.s3.prefix | | | The prefix for the file where metadata will be written on s3 if s3 emitter is set |
-| spark.datahub.s3.filename | | | The name of the file where metadata will be written if it is not set random filename will be used on s3 if s3 emitter is set |
-
+| spark.datahub.metadata.remove_partition_pattern         |          |                       | Remove partition pattern (e.g. /partition=\d+). It changes database/table/partition=123 to database/table |
+| spark.datahub.coalesce_jobs                             |          | true                  | Only one datajob (task) will be emitted, containing all input and output datasets for the Spark application |
+| spark.datahub.parent.datajob_urn | | | Specified dataset will be set as upstream dataset for datajob created. Effective only when spark.datahub.coalesce_jobs is set to true |
+| spark.datahub.metadata.dataset.materialize | | false | Materialize Datasets in DataHub |
+| spark.datahub.platform.s3.path_spec_list | | | List of pathspec per platform |
+| spark.datahub.metadata.dataset.include_schema_metadata  |          | false                 | Emit dataset schema metadata based on the Spark execution. It is recommended to get schema information from platform-specific DataHub sources, as the schema captured from the Spark execution is less reliable |
+| spark.datahub.flow_name                                 |          |                       | If set, it will be used as the DataFlow name; otherwise the Spark app name is used as the flow name |
+| spark.datahub.file_partition_regexp | | | Strip partition part from the path if path end matches with the specified regexp. Example `year=.*/month=.*/day=.*` |
+| spark.datahub.tags | | | Comma separated list of tags to attach to the DataFlow |
+| spark.datahub.domains | | | Comma separated list of domain urns to attach to the DataFlow |
+| spark.datahub.stage_metadata_coalescing                 |          |                       | Normally metadata is coalesced and sent at the onApplicationEnd event, which is never called on Databricks or on Glue. Enable this on Databricks if you want a coalesced run. |
+| spark.datahub.patch.enabled | | false | Set this to true to send lineage as a patch, which appends rather than overwrites existing Dataset lineage edges. By default, it is disabled. |
+| spark.datahub.metadata.dataset.lowerCaseUrns | | false | Set this to true to lowercase dataset urns. By default, it is disabled. |
+| spark.datahub.disableSymlinkResolution | | false | Set this to true if you prefer using the s3 location instead of the Hive table. By default, it is disabled. |
+| spark.datahub.s3.bucket | | | The name of the bucket where metadata will be written if s3 emitter is set |
+| spark.datahub.s3.prefix | | | The prefix for the file where metadata will be written on s3 if s3 emitter is set |
+| spark.datahub.s3.filename                               |          |                       | The name of the file where metadata will be written; if it is not set, a random filename will be used on s3 if the s3 emitter is set |
+| spark.datahub.log.mcps                                  |          | true                  | Set this to true to log MCPs to the log. By default, it is enabled. |
+| spark.datahub.legacyLineageCleanup.enabled              |          | false                 | Set this to true to remove legacy lineages from older Spark Plugin runs. This removes those lineages from the Datasets that the plugin now adds to the DataJob instead. By default, it is disabled. |
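
To make the table above concrete, here is a minimal sketch of wiring a few of these options into a Spark job programmatically; the same keys can equally be passed with `--conf` on `spark-submit`. The app name, master, server URL and token below are placeholders, not values required by the plugin.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public class DatahubLineageConfSketch {
  public static void main(String[] args) {
    // Placeholder values; replace the server URL and token with your own.
    // spark.jars.packages is usually supplied via --packages/--conf on spark-submit;
    // it is shown here only to keep the example in one place.
    SparkConf conf = new SparkConf()
        .setAppName("datahub-lineage-example")
        .setMaster("local[*]") // local master only for illustration
        .set("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.15")
        .set("spark.extraListeners", "datahub.spark.DatahubSparkListener")
        .set("spark.datahub.emitter", "rest")
        .set("spark.datahub.rest.server", "http://localhost:8080")
        .set("spark.datahub.rest.token", "<personal-access-token>")
        .set("spark.datahub.metadata.dataset.env", "PROD");

    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
    // Run jobs as usual; the listener emits lineage on job and application events.
    spark.stop();
  }
}
```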
## What to Expect: The Metadata Model
@@ -358,6 +360,19 @@ Use Java 8 to build the project. The project uses Gradle as the build tool. To b
+
## Changelog
+### Version 0.2.17
+- *Major changes*:
+  - Fine-grained lineage is emitted on the DataJob and not on the emitted Datasets. This is the correct behaviour, which earlier versions did not follow. As a consequence, fine-grained lineages emitted by earlier versions will not be overwritten by the new ones.
+    You can remove the old lineages by setting `spark.datahub.legacyLineageCleanup.enabled=true`. Make sure you run the latest server if you enable this together with patch support. (Introduced in 0.2.17-rc5.)
+
+- *Changes*:
+ - OpenLineage 1.25.0 upgrade
+  - Add option to disable chunked encoding in the DataHub REST sink -> `spark.datahub.rest.disable_chunked_encoding`
+  - Add option to specify the MCP Kafka topic for the DataHub Kafka sink -> `spark.datahub.kafka.mcp_topic`
+  - Add option to remove legacy lineages from older Spark Plugin runs. This removes those lineages from the Datasets that the plugin now adds to the DataJob instead -> `spark.datahub.legacyLineageCleanup.enabled`
+- *Fixes*:
+  - Fix handling of map transformations in lineage. Earlier versions generated incorrect lineage for map transformations.
+
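
The options introduced in this release are ordinary Spark configuration keys. Below is a minimal sketch of how the Kafka-related and cleanup options from the changelog might be set; the broker, schema-registry and topic values are placeholders, and only the config keys come from the table and changelog above.

```java
import org.apache.spark.SparkConf;

public class Datahub0217OptionsSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf()
        .set("spark.extraListeners", "datahub.spark.DatahubSparkListener")
        .set("spark.datahub.emitter", "kafka")
        .set("spark.datahub.kafka.bootstrap", "broker-1:9092,broker-2:9092")
        .set("spark.datahub.kafka.schema_registry_url", "http://schema-registry:8081")
        // Placeholder topic name; point this at your MCP topic.
        .set("spark.datahub.kafka.mcp_topic", "MetadataChangeProposal_v1")
        // Clean up Dataset-level lineage written by pre-0.2.17 plugin runs.
        .set("spark.datahub.legacyLineageCleanup.enabled", "true")
        // Chunked transfer encoding only matters for the REST emitter; shown for completeness.
        .set("spark.datahub.rest.disable_chunked_encoding", "true");

    // Print the resulting configuration for inspection.
    System.out.println(conf.toDebugString());
  }
}
```

As noted above, combine `legacyLineageCleanup` with a recent DataHub server if patch support (`spark.datahub.patch.enabled`) is enabled.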
### Version 0.2.16
- Remove logging DataHub config into logs
diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java
index 0bcc7db9e87408..84f397226ce912 100644
--- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java
+++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java
@@ -1,12 +1,18 @@
package datahub.spark;
+import static com.linkedin.metadata.Constants.*;
import static datahub.spark.converter.SparkStreamingEventToDatahub.*;
import static io.datahubproject.openlineage.converter.OpenLineageToDataHub.*;
import static io.datahubproject.openlineage.utils.DatahubUtils.*;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.core.StreamReadConstraints;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.linkedin.common.GlobalTags;
import com.linkedin.common.UrnArray;
import com.linkedin.common.urn.DataJobUrn;
+import com.linkedin.data.DataMap;
+import com.linkedin.data.template.JacksonDataTemplateCodec;
import com.linkedin.data.template.StringMap;
import com.linkedin.dataprocess.DataProcessInstanceRelationships;
import com.linkedin.dataprocess.RunResultType;
@@ -62,12 +68,23 @@ public class DatahubEventEmitter extends EventEmitter {
private final Map schemaMap = new HashMap<>();
private SparkLineageConf datahubConf;
private static final int DEFAULT_TIMEOUT_SEC = 10;
+ private final ObjectMapper objectMapper;
+ private final JacksonDataTemplateCodec dataTemplateCodec;
private final EventFormatter eventFormatter = new EventFormatter();
public DatahubEventEmitter(SparkOpenLineageConfig config, String applicationJobName)
throws URISyntaxException {
super(config, applicationJobName);
+ objectMapper = new ObjectMapper().setSerializationInclusion(JsonInclude.Include.NON_NULL);
+ int maxSize =
+ Integer.parseInt(
+ System.getenv()
+ .getOrDefault(INGESTION_MAX_SERIALIZED_STRING_LENGTH, MAX_JACKSON_STRING_SIZE));
+ objectMapper
+ .getFactory()
+ .setStreamReadConstraints(StreamReadConstraints.builder().maxStringLength(maxSize).build());
+ dataTemplateCodec = new JacksonDataTemplateCodec(objectMapper.getFactory());
}
private Optional getEmitter() {
@@ -407,7 +424,14 @@ protected void emitMcps(List mcps) {
.map(
mcp -> {
try {
- log.info("emitting mcpw: " + mcp);
+ if (this.datahubConf.isLogMcps()) {
+ DataMap map = mcp.data();
+ String serializedMCP = dataTemplateCodec.mapToString(map);
+ log.info("emitting mcpw: {}", serializedMCP);
+ } else {
+ log.info(
+ "emitting aspect: {} for urn: {}", mcp.getAspectName(), mcp.getEntityUrn());
+ }
return emitter.get().emit(mcp);
} catch (IOException ioException) {
log.error("Failed to emit metadata to DataHub", ioException);
diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java
index 3860285083c4bb..824cd1a687b264 100644
--- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java
+++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java
@@ -31,6 +31,7 @@ public class SparkConfigParser {
public static final String FILE_EMITTER_FILE_NAME = "file.filename";
public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification";
public static final String REST_DISABLE_CHUNKED_ENCODING = "rest.disable_chunked_encoding";
+ public static final String CONFIG_LOG_MCPS = "log.mcps";
public static final String MAX_RETRIES = "rest.max_retries";
public static final String RETRY_INTERVAL_IN_SEC = "rest.retry_interval_in_sec";
@@ -51,6 +52,7 @@ public class SparkConfigParser {
public static final String COALESCE_KEY = "coalesce_jobs";
public static final String PATCH_ENABLED = "patch.enabled";
+ public static final String LEGACY_LINEAGE_CLEANUP = "legacyLineageCleanup.enabled";
public static final String DISABLE_SYMLINK_RESOLUTION = "disableSymlinkResolution";
public static final String STAGE_METADATA_COALESCING = "stage_metadata_coalescing";
@@ -158,6 +160,7 @@ public static DatahubOpenlineageConfig sparkConfigToDatahubOpenlineageConf(
Config sparkConfig, SparkAppContext sparkAppContext) {
DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder =
DatahubOpenlineageConfig.builder();
+ builder.isSpark(true);
builder.filePartitionRegexpPattern(
SparkConfigParser.getFilePartitionRegexpPattern(sparkConfig));
builder.fabricType(SparkConfigParser.getCommonFabricType(sparkConfig));
@@ -172,6 +175,7 @@ public static DatahubOpenlineageConfig sparkConfigToDatahubOpenlineageConf(
builder.commonDatasetPlatformInstance(SparkConfigParser.getCommonPlatformInstance(sparkConfig));
builder.hivePlatformAlias(SparkConfigParser.getHivePlatformAlias(sparkConfig));
builder.usePatch(SparkConfigParser.isPatchEnabled(sparkConfig));
+ builder.removeLegacyLineage(SparkConfigParser.isLegacyLineageCleanupEnabled(sparkConfig));
builder.disableSymlinkResolution(SparkConfigParser.isDisableSymlinkResolution(sparkConfig));
builder.lowerCaseDatasetUrns(SparkConfigParser.isLowerCaseDatasetUrns(sparkConfig));
try {
@@ -311,6 +315,13 @@ public static boolean isDatasetMaterialize(Config datahubConfig) {
&& datahubConfig.getBoolean(DATASET_MATERIALIZE_KEY);
}
+ public static boolean isLogMcps(Config datahubConfig) {
+ if (datahubConfig.hasPath(CONFIG_LOG_MCPS)) {
+ return datahubConfig.getBoolean(CONFIG_LOG_MCPS);
+ }
+ return true;
+ }
+
public static boolean isIncludeSchemaMetadata(Config datahubConfig) {
if (datahubConfig.hasPath(DATASET_INCLUDE_SCHEMA_METADATA)) {
return datahubConfig.getBoolean(DATASET_INCLUDE_SCHEMA_METADATA);
@@ -352,6 +363,14 @@ public static boolean isPatchEnabled(Config datahubConfig) {
return datahubConfig.hasPath(PATCH_ENABLED) && datahubConfig.getBoolean(PATCH_ENABLED);
}
+ public static boolean isLegacyLineageCleanupEnabled(Config datahubConfig) {
+    return datahubConfig.hasPath(LEGACY_LINEAGE_CLEANUP)
+        && datahubConfig.getBoolean(LEGACY_LINEAGE_CLEANUP);
+ }
+
public static boolean isDisableSymlinkResolution(Config datahubConfig) {
if (!datahubConfig.hasPath(DISABLE_SYMLINK_RESOLUTION)) {
return false;
diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java
index 014cff873bbde9..96afe729b82c00 100644
--- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java
+++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java
@@ -17,6 +17,7 @@ public class SparkLineageConf {
final DatahubOpenlineageConfig openLineageConf;
@Builder.Default final boolean coalesceEnabled = true;
@Builder.Default final boolean emitCoalescePeriodically = false;
+ @Builder.Default final boolean logMcps = true;
final SparkAppContext sparkAppContext;
final DatahubEmitterConfig datahubEmitterConfig;
@Builder.Default final List tags = new LinkedList<>();
@@ -32,6 +33,7 @@ public static SparkLineageConf toSparkLineageConf(
SparkConfigParser.sparkConfigToDatahubOpenlineageConf(sparkConfig, sparkAppContext);
builder.openLineageConf(datahubOpenlineageConfig);
builder.coalesceEnabled(SparkConfigParser.isCoalesceEnabled(sparkConfig));
+ builder.logMcps(SparkConfigParser.isLogMcps(sparkConfig));
if (SparkConfigParser.getTags(sparkConfig) != null) {
builder.tags(Arrays.asList(Objects.requireNonNull(SparkConfigParser.getTags(sparkConfig))));
}
diff --git a/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java b/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java
index ef2b17e9932f2f..b9a142364d4e89 100644
--- a/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java
+++ b/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java
@@ -814,4 +814,32 @@ public void testProcessGCSInputsOutputs() throws URISyntaxException, IOException
dataset.getUrn().toString());
}
}
+
+ public void testProcessMappartitionJob() throws URISyntaxException, IOException {
+ DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder =
+ DatahubOpenlineageConfig.builder();
+ builder.fabricType(FabricType.DEV);
+ builder.lowerCaseDatasetUrns(true);
+ builder.materializeDataset(true);
+ builder.includeSchemaMetadata(true);
+ builder.isSpark(true);
+
+ String olEvent =
+ IOUtils.toString(
+ this.getClass().getResourceAsStream("/ol_events/map_partition_job.json"),
+ StandardCharsets.UTF_8);
+
+ OpenLineage.RunEvent runEvent = OpenLineageClientUtils.runEventFromJson(olEvent);
+ DatahubJob datahubJob = OpenLineageToDataHub.convertRunEventToJob(runEvent, builder.build());
+
+ assertNotNull(datahubJob);
+
+ assertEquals(1, datahubJob.getInSet().size());
+ for (DatahubDataset dataset : datahubJob.getInSet()) {
+ assertEquals(
+ "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/my_dir/my_file.csv,DEV)",
+ dataset.getUrn().toString());
+ }
+ assertEquals(0, datahubJob.getOutSet().size());
+ }
}
diff --git a/metadata-integration/java/acryl-spark-lineage/src/test/resources/ol_events/map_partition_job.json b/metadata-integration/java/acryl-spark-lineage/src/test/resources/ol_events/map_partition_job.json
new file mode 100644
index 00000000000000..39560a782840ce
--- /dev/null
+++ b/metadata-integration/java/acryl-spark-lineage/src/test/resources/ol_events/map_partition_job.json
@@ -0,0 +1,66 @@
+{
+ "eventTime": "2024-11-20T12:59:29.059Z",
+ "producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark",
+ "schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunEvent",
+ "eventType": "START",
+ "run": {
+ "runId": "01902a1e-0b05-750e-b38d-439998f7a853",
+ "facets": {
+ "parent": {
+ "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark",
+ "_schemaURL": "https://openlineage.io/spec/facets/1-0-1/ParentRunFacet.json#/$defs/ParentRunFacet",
+ "run": {
+ "runId": "01902a1e-0b05-750e-b38d-439998f7a853"
+ },
+ "job": {
+ "namespace": "default",
+ "name": "spark_context_session"
+ }
+ },
+ "processing_engine": {
+ "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark",
+ "_schemaURL": "https://openlineage.io/spec/facets/1-1-1/ProcessingEngineRunFacet.json#/$defs/ProcessingEngineRunFacet",
+ "version": "3.4.2",
+ "name": "spark"
+ },
+ "spark_jobDetails": {
+ "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark",
+ "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet",
+ "jobId": 0
+ },
+ "spark_properties": {
+ "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark",
+ "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet",
+ "properties": {
+ "spark.master": "yarn",
+ "spark.app.name": "SparkContextSession"
+ }
+ }
+ }
+ },
+ "job": {
+ "namespace": "default",
+ "name": "spark_context_session.map_partitions_parallel_collection",
+ "facets": {
+ "jobType": {
+ "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark",
+ "_schemaURL": "https://openlineage.io/spec/facets/2-0-3/JobTypeJobFacet.json#/$defs/JobTypeJobFacet",
+ "processingType": "BATCH",
+ "integration": "SPARK",
+ "jobType": "RDD_JOB"
+ }
+ }
+ },
+ "inputs": [
+ {
+ "namespace": "s3://my-bucket",
+ "name": "my_dir/my_file.csv"
+ }
+ ],
+ "outputs": [
+ {
+ "namespace": "s3://my-bucket",
+ "name": "my_dir/my_file.csv"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java
index 5abb3c90d232bd..c725673eae47b5 100644
--- a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java
+++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java
@@ -16,6 +16,7 @@
@Getter
@ToString
public class DatahubOpenlineageConfig {
+ @Builder.Default private final boolean isSpark = false;
@Builder.Default private final boolean isStreaming = false;
@Builder.Default private final String pipelineName = null;
private final String platformInstance;
@@ -34,6 +35,7 @@ public class DatahubOpenlineageConfig {
@Builder.Default private Map urnAliases = new HashMap<>();
@Builder.Default private final boolean disableSymlinkResolution = false;
@Builder.Default private final boolean lowerCaseDatasetUrns = false;
+ @Builder.Default private final boolean removeLegacyLineage = false;
public List getPathSpecsForPlatform(String platform) {
if ((pathSpecs == null) || (pathSpecs.isEmpty())) {
diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java
index 9237ee60f473b4..9fcfc68bd03f55 100644
--- a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java
+++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java
@@ -675,9 +675,30 @@ private static void convertJobToDataJob(
datahubJob.setJobInfo(dji);
DataJobInputOutput inputOutput = new DataJobInputOutput();
+ boolean inputsEqualOutputs = false;
+ if ((datahubConf.isSpark())
+ && ((event.getInputs() != null && event.getOutputs() != null)
+ && (event.getInputs().size() == event.getOutputs().size()))) {
+ inputsEqualOutputs =
+ event.getInputs().stream()
+ .map(OpenLineage.Dataset::getName)
+ .collect(Collectors.toSet())
+ .equals(
+ event.getOutputs().stream()
+ .map(OpenLineage.Dataset::getName)
+ .collect(Collectors.toSet()));
+ if (inputsEqualOutputs) {
+ log.info(
+ "Inputs equals Outputs: {}. This is most probably because of an rdd map operation and we only process Inputs",
+ inputsEqualOutputs);
+ }
+ }
+
processJobInputs(datahubJob, event, datahubConf);
- processJobOutputs(datahubJob, event, datahubConf);
+ if (!inputsEqualOutputs) {
+ processJobOutputs(datahubJob, event, datahubConf);
+ }
DataProcessInstanceRunEvent dpire = processDataProcessInstanceResult(event);
datahubJob.setDataProcessInstanceRunEvent(dpire);
diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java
index 60caaae359677f..e2aa2c3a04c406 100644
--- a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java
+++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java
@@ -28,7 +28,10 @@
import com.linkedin.dataprocess.DataProcessInstanceRelationships;
import com.linkedin.dataprocess.DataProcessInstanceRunEvent;
import com.linkedin.dataset.FineGrainedLineage;
+import com.linkedin.dataset.FineGrainedLineageArray;
import com.linkedin.dataset.Upstream;
+import com.linkedin.dataset.UpstreamArray;
+import com.linkedin.dataset.UpstreamLineage;
import com.linkedin.domain.Domains;
import com.linkedin.metadata.aspect.patch.builder.DataJobInputOutputPatchBuilder;
import com.linkedin.metadata.aspect.patch.builder.GlobalTagsPatchBuilder;
@@ -167,11 +170,34 @@ public List toMcps(DatahubOpenlineageConfig config) thro
return mcps;
}
+ private FineGrainedLineageArray mergeFinegrainedLineages() {
+ FineGrainedLineageArray fgls = new FineGrainedLineageArray();
+
+ for (DatahubDataset dataset : inSet) {
+ if (dataset.lineage != null && dataset.lineage.getFineGrainedLineages() != null) {
+ dataset.lineage.getFineGrainedLineages().stream()
+ .filter(Objects::nonNull)
+ .forEach(fgls::add);
+ }
+ }
+
+ for (DatahubDataset dataset : outSet) {
+ if (dataset.lineage != null && dataset.lineage.getFineGrainedLineages() != null) {
+ dataset.lineage.getFineGrainedLineages().stream()
+ .filter(Objects::nonNull)
+ .forEach(fgls::add);
+ }
+ }
+
+ return fgls;
+ }
+
private void generateDataJobInputOutputMcp(
EdgeArray inputEdges,
EdgeArray outputEdges,
DatahubOpenlineageConfig config,
List mcps) {
+
DataJobInputOutput dataJobInputOutput = new DataJobInputOutput();
log.info("Adding DataJob edges to {}", jobUrn);
if (config.isUsePatch() && (!parentJobs.isEmpty() || !inSet.isEmpty() || !outSet.isEmpty())) {
@@ -186,6 +212,27 @@ private void generateDataJobInputOutputMcp(
for (DataJobUrn parentJob : parentJobs) {
dataJobInputOutputPatchBuilder.addInputDatajobEdge(parentJob);
}
+
+ FineGrainedLineageArray fgls = mergeFinegrainedLineages();
+ fgls.forEach(
+ fgl -> {
+ Objects.requireNonNull(fgl.getUpstreams())
+ .forEach(
+ upstream -> {
+ Objects.requireNonNull(fgl.getDownstreams())
+ .forEach(
+ downstream -> {
+ dataJobInputOutputPatchBuilder.addFineGrainedUpstreamField(
+ upstream,
+ fgl.getConfidenceScore(),
+ StringUtils.defaultIfEmpty(
+ fgl.getTransformOperation(), "TRANSFORM"),
+ downstream,
+ fgl.getQuery());
+ });
+ });
+ });
+
MetadataChangeProposal dataJobInputOutputMcp = dataJobInputOutputPatchBuilder.build();
log.info(
"dataJobInputOutputMcp: {}",
@@ -195,6 +242,8 @@ private void generateDataJobInputOutputMcp(
mcps.add(dataJobInputOutputPatchBuilder.build());
} else {
+ FineGrainedLineageArray fgls = mergeFinegrainedLineages();
+ dataJobInputOutput.setFineGrainedLineages(fgls);
dataJobInputOutput.setInputDatasetEdges(inputEdges);
dataJobInputOutput.setInputDatasets(new DatasetUrnArray());
dataJobInputOutput.setOutputDatasetEdges(outputEdges);
@@ -235,6 +284,49 @@ private void generateDataProcessInstanceMcp(
generateDataProcessInstanceRelationship(mcps);
}
+ private void deleteOldDatasetLineage(
+ DatahubDataset dataset, DatahubOpenlineageConfig config, List mcps) {
+ if (dataset.getLineage() != null) {
+ if (config.isUsePatch()) {
+ if (!dataset.getLineage().getUpstreams().isEmpty()) {
+ UpstreamLineagePatchBuilder upstreamLineagePatchBuilder =
+ new UpstreamLineagePatchBuilder().urn(dataset.getUrn());
+ for (Upstream upstream : dataset.getLineage().getUpstreams()) {
+ upstreamLineagePatchBuilder.removeUpstream(upstream.getDataset());
+ }
+
+ log.info("Removing FineGrainedLineage to {}", dataset.getUrn());
+ for (FineGrainedLineage fineGrainedLineage :
+ Objects.requireNonNull(dataset.getLineage().getFineGrainedLineages())) {
+ for (Urn upstream : Objects.requireNonNull(fineGrainedLineage.getUpstreams())) {
+ for (Urn downstream : Objects.requireNonNull(fineGrainedLineage.getDownstreams())) {
+ upstreamLineagePatchBuilder.removeFineGrainedUpstreamField(
+ upstream,
+ StringUtils.defaultIfEmpty(
+ fineGrainedLineage.getTransformOperation(), "TRANSFORM"),
+ downstream,
+ null);
+ }
+ }
+ }
+ MetadataChangeProposal mcp = upstreamLineagePatchBuilder.build();
+ log.info(
+ "upstreamLineagePatch: {}",
+ mcp.getAspect().getValue().asString(Charset.defaultCharset()));
+ mcps.add(mcp);
+ }
+ } else {
+ if (!dataset.getLineage().getUpstreams().isEmpty()) {
+ // Remove earlier created UpstreamLineage which most probably was created by the plugin.
+ UpstreamLineage upstreamLineage = new UpstreamLineage();
+ upstreamLineage.setUpstreams(new UpstreamArray());
+ upstreamLineage.setFineGrainedLineages(new FineGrainedLineageArray());
+ addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, upstreamLineage, mcps);
+ }
+ }
+ }
+ }
+
private Pair processDownstreams(
DatahubOpenlineageConfig config, List mcps) {
UrnArray outputUrnArray = new UrnArray();
@@ -263,43 +355,13 @@ private Pair processDownstreams(
dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getSchemaMetadata(), mcps);
}
- if (dataset.getLineage() != null) {
- if (config.isUsePatch()) {
- if (!dataset.getLineage().getUpstreams().isEmpty()) {
- UpstreamLineagePatchBuilder upstreamLineagePatchBuilder =
- new UpstreamLineagePatchBuilder().urn(dataset.getUrn());
- for (Upstream upstream : dataset.getLineage().getUpstreams()) {
- upstreamLineagePatchBuilder.addUpstream(
- upstream.getDataset(), upstream.getType());
- }
-
- log.info("Adding FineGrainedLineage to {}", dataset.getUrn());
- for (FineGrainedLineage fineGrainedLineage :
- Objects.requireNonNull(dataset.getLineage().getFineGrainedLineages())) {
- for (Urn upstream : Objects.requireNonNull(fineGrainedLineage.getUpstreams())) {
- for (Urn downstream :
- Objects.requireNonNull(fineGrainedLineage.getDownstreams())) {
- upstreamLineagePatchBuilder.addFineGrainedUpstreamField(
- upstream,
- fineGrainedLineage.getConfidenceScore(),
- StringUtils.defaultIfEmpty(
- fineGrainedLineage.getTransformOperation(), "TRANSFORM"),
- downstream,
- null);
- }
- }
- }
- MetadataChangeProposal mcp = upstreamLineagePatchBuilder.build();
- log.info(
- "upstreamLineagePatch: {}",
- mcp.getAspect().getValue().asString(Charset.defaultCharset()));
- mcps.add(mcp);
- }
- } else {
- addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getLineage(), mcps);
- }
+ // Remove lineage which was added by older plugin that set lineage on Datasets and not on
+ // DataJobs
+ if (config.isRemoveLegacyLineage()) {
+ deleteOldDatasetLineage(dataset, config, mcps);
}
});
+
return Pair.of(outputUrnArray, outputEdges);
}
@@ -330,10 +392,6 @@ private Pair processUpstreams(
addAspectToMcps(
dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getSchemaMetadata(), mcps);
}
-
- if (dataset.getLineage() != null) {
- addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getLineage(), mcps);
- }
});
return Pair.of(inputUrnArray, inputEdges);
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
index 3d35f5956b0f4f..35d133c74c0692 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
@@ -775,7 +775,8 @@ public List batchIngestProposals(
List updatedUrns = new ArrayList<>();
Iterators.partition(
- metadataChangeProposals.iterator(), Math.max(1, entityClientConfig.getBatchGetV2Size()))
+ metadataChangeProposals.iterator(),
+ Math.max(1, entityClientConfig.getBatchIngestSize()))
.forEachRemaining(
batch -> {
AspectsBatch aspectsBatch =
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java
index 6de79b6c4b181e..792e67e69f2da6 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java
@@ -411,6 +411,8 @@ private void reindex(ReindexConfig indexState) throws Throwable {
boolean reindexTaskCompleted = false;
Pair documentCounts = getDocumentCounts(indexState.name(), tempIndexName);
long documentCountsLastUpdated = System.currentTimeMillis();
+ long previousDocCount = documentCounts.getSecond();
+ long estimatedMinutesRemaining = 0;
while (System.currentTimeMillis() < timeoutAt) {
log.info(
@@ -421,8 +423,22 @@ private void reindex(ReindexConfig indexState) throws Throwable {
Pair tempDocumentsCount = getDocumentCounts(indexState.name(), tempIndexName);
if (!tempDocumentsCount.equals(documentCounts)) {
- documentCountsLastUpdated = System.currentTimeMillis();
+ long currentTime = System.currentTimeMillis();
+ long timeElapsed = currentTime - documentCountsLastUpdated;
+ long docsIndexed = tempDocumentsCount.getSecond() - previousDocCount;
+
+ // Calculate indexing rate (docs per millisecond)
+ double indexingRate = timeElapsed > 0 ? (double) docsIndexed / timeElapsed : 0;
+
+ // Calculate remaining docs and estimated time
+ long remainingDocs = tempDocumentsCount.getFirst() - tempDocumentsCount.getSecond();
+ long estimatedMillisRemaining =
+ indexingRate > 0 ? (long) (remainingDocs / indexingRate) : 0;
+ estimatedMinutesRemaining = estimatedMillisRemaining / (1000 * 60);
+
+ documentCountsLastUpdated = currentTime;
documentCounts = tempDocumentsCount;
+ previousDocCount = documentCounts.getSecond();
}
if (documentCounts.getFirst().equals(documentCounts.getSecond())) {
@@ -435,12 +451,15 @@ private void reindex(ReindexConfig indexState) throws Throwable {
break;
} else {
+ float progressPercentage =
+ 100 * (1.0f * documentCounts.getSecond()) / documentCounts.getFirst();
log.warn(
- "Task: {} - Document counts do not match {} != {}. Complete: {}%",
+ "Task: {} - Document counts do not match {} != {}. Complete: {}%. Estimated time remaining: {} minutes",
parentTaskId,
documentCounts.getFirst(),
documentCounts.getSecond(),
- 100 * (1.0f * documentCounts.getSecond()) / documentCounts.getFirst());
+ progressPercentage,
+ estimatedMinutesRemaining);
long lastUpdateDelta = System.currentTimeMillis() - documentCountsLastUpdated;
if (lastUpdateDelta > (300 * 1000)) {
diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml
index 0193e5e2c5c6c3..1556b72e4aefb1 100644
--- a/metadata-models/src/main/resources/entity-registry.yml
+++ b/metadata-models/src/main/resources/entity-registry.yml
@@ -70,6 +70,7 @@ entities:
- glossaryTerms
- institutionalMemory
- dataPlatformInstance
+ - container
- browsePathsV2
- structuredProperties
- forms
@@ -93,6 +94,7 @@ entities:
- glossaryTerms
- institutionalMemory
- dataPlatformInstance
+ - container
- browsePathsV2
- structuredProperties
- incidentsSummary
diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml
index c0c5be85b16b1d..8879a2f6549945 100644
--- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml
+++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml
@@ -21,19 +21,30 @@
truncate_indices: {{truncate_indices}}{{^truncate_indices}}true{{/truncate_indices}}
truncate_index_older_than_days: {{truncate_indices_retention_days}}{{^truncate_indices_retention_days}}30{{/truncate_indices_retention_days}}
dataprocess_cleanup:
+ enabled: {{dataprocess_cleanup.enabled}}{{^dataprocess_cleanup.enabled}}false{{/dataprocess_cleanup.enabled}}
retention_days: {{dataprocess_cleanup.retention_days}}{{^dataprocess_cleanup.retention_days}}10{{/dataprocess_cleanup.retention_days}}
- delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}true{{/dataprocess_cleanup.delete_empty_data_jobs}}
- delete_empty_data_flows: {{dataprocess_cleanup.delete_empty_data_flows}}{{^dataprocess_cleanup.delete_empty_data_flows}}true{{/dataprocess_cleanup.delete_empty_data_flows}}
+ delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}false{{/dataprocess_cleanup.delete_empty_data_jobs}}
+ delete_empty_data_flows: {{dataprocess_cleanup.delete_empty_data_flows}}{{^dataprocess_cleanup.delete_empty_data_flows}}false{{/dataprocess_cleanup.delete_empty_data_flows}}
hard_delete_entities: {{dataprocess_cleanup.hard_delete_entities}}{{^dataprocess_cleanup.hard_delete_entities}}false{{/dataprocess_cleanup.hard_delete_entities}}
keep_last_n: {{dataprocess_cleanup.keep_last_n}}{{^dataprocess_cleanup.keep_last_n}}5{{/dataprocess_cleanup.keep_last_n}}
+ batch_size: {{dataprocess_cleanup.batch_size}}{{^dataprocess_cleanup.batch_size}}500{{/dataprocess_cleanup.batch_size}}
+ max_workers: {{dataprocess_cleanup.max_workers}}{{^dataprocess_cleanup.max_workers}}10{{/dataprocess_cleanup.max_workers}}
soft_deleted_entities_cleanup:
retention_days: {{soft_deleted_entities_cleanup.retention_days}}{{^soft_deleted_entities_cleanup.retention_days}}10{{/soft_deleted_entities_cleanup.retention_days}}
+ enabled: {{soft_deleted_entities_cleanup.enabled}}{{^soft_deleted_entities_cleanup.enabled}}true{{/soft_deleted_entities_cleanup.enabled}}
+ batch_size: {{soft_deleted_entities_cleanup.batch_size}}{{^soft_deleted_entities_cleanup.batch_size}}500{{/soft_deleted_entities_cleanup.batch_size}}
+ max_workers: {{soft_deleted_entities_cleanup.max_workers}}{{^soft_deleted_entities_cleanup.max_workers}}10{{/soft_deleted_entities_cleanup.max_workers}}
+ limit_entities_delete: {{soft_deleted_entities_cleanup.limit_entities_delete}}{{^soft_deleted_entities_cleanup.limit_entities_delete}}25000{{/soft_deleted_entities_cleanup.limit_entities_delete}}
+ runtime_limit_seconds: {{soft_deleted_entities_cleanup.runtime_limit_seconds}}{{^soft_deleted_entities_cleanup.runtime_limit_seconds}}7200{{/soft_deleted_entities_cleanup.runtime_limit_seconds}}
execution_request_cleanup:
keep_history_min_count: {{execution_request_cleanup.keep_history_min_count}}{{^execution_request_cleanup.keep_history_min_count}}10{{/execution_request_cleanup.keep_history_min_count}}
keep_history_max_count: {{execution_request_cleanup.keep_history_max_count}}{{^execution_request_cleanup.keep_history_max_count}}1000{{/execution_request_cleanup.keep_history_max_count}}
- keep_history_max_days: {{execution_request_cleanup.keep_history_max_days}}{{^execution_request_cleanup.keep_history_max_days}}30{{/execution_request_cleanup.keep_history_max_days}}
+ keep_history_max_days: {{execution_request_cleanup.keep_history_max_days}}{{^execution_request_cleanup.keep_history_max_days}}90{{/execution_request_cleanup.keep_history_max_days}}
batch_read_size: {{execution_request_cleanup.batch_read_size}}{{^execution_request_cleanup.batch_read_size}}100{{/execution_request_cleanup.batch_read_size}}
- enabled: {{execution_request_cleanup.enabled}}{{^execution_request_cleanup.enabled}}false{{/execution_request_cleanup.enabled}}
+ enabled: {{execution_request_cleanup.enabled}}{{^execution_request_cleanup.enabled}}true{{/execution_request_cleanup.enabled}}
+ runtime_limit_seconds: {{execution_request_cleanup.runtime_limit_seconds}}{{^execution_request_cleanup.runtime_limit_seconds}}3600{{/execution_request_cleanup.runtime_limit_seconds}}
+ limit_entities_delete: {{execution_request_cleanup.limit_entities_delete}}{{^execution_request_cleanup.limit_entities_delete}}10000{{/execution_request_cleanup.limit_entities_delete}}
+ max_read_errors: {{execution_request_cleanup.max_read_errors}}{{^execution_request_cleanup.max_read_errors}}10{{/execution_request_cleanup.max_read_errors}}
extraArgs: {}
debugMode: false
executorId: default
diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
index 8d4c5e9228a71c..ca775619220831 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
@@ -103,10 +103,10 @@
import java.util.stream.StreamSupport;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
-import javax.mail.MethodNotSupportedException;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.NotImplementedException;
+import org.apache.http.MethodNotSupportedException;
import org.opensearch.core.common.util.CollectionUtils;
@Slf4j
@@ -1195,7 +1195,7 @@ public DataMap getRawAspect(
@Nonnull String aspect,
@Nonnull Long version)
throws RemoteInvocationException {
- throw new MethodNotSupportedException();
+ throw new MethodNotSupportedException("Method not supported");
}
@Override
diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle
index 07ce50993655d2..4b24eeac50b0b7 100644
--- a/metadata-utils/build.gradle
+++ b/metadata-utils/build.gradle
@@ -26,6 +26,7 @@ dependencies {
implementation externalDependency.slf4jApi
compileOnly externalDependency.lombok
+ runtimeOnly externalDependency.javaxMail
annotationProcessor externalDependency.lombok
@@ -40,6 +41,9 @@ dependencies {
implementation(externalDependency.log4jApi) {
because("previous versions are vulnerable to CVE-2021-45105")
}
+ implementation(externalDependency.javaxMail) {
+ because("previous versions are vulnerable")
+ }
}
implementation externalDependency.logbackClassic
diff --git a/settings.gradle b/settings.gradle
index b0c2c707d566c0..77d0706549a439 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -78,3 +78,52 @@ include ':metadata-operation-context'
include ':metadata-service:openapi-servlet:models'
include ':metadata-integration:java:datahub-schematron:lib'
include ':metadata-integration:java:datahub-schematron:cli'
+
+def installPreCommitHooks() {
+ def preCommitInstalled = false
+ try {
+ def process = ["which", "pre-commit"].execute()
+ def stdout = new StringBuilder()
+ def stderr = new StringBuilder()
+ process.waitForProcessOutput(stdout, stderr)
+ preCommitInstalled = (process.exitValue() == 0)
+ println "Pre-commit check: ${stdout}"
+ } catch (Exception e) {
+ println "Error checking pre-commit: ${e.message}"
+ return
+ }
+
+ if (!preCommitInstalled) {
+ try {
+ def installProcess = ["python", "-m", "pip", "install", "pre-commit"].execute()
+ def stdout = new StringBuilder()
+ def stderr = new StringBuilder()
+ installProcess.waitForProcessOutput(stdout, stderr)
+ if (installProcess.exitValue() != 0) {
+ println "Failed to install pre-commit: ${stderr}"
+ return
+ }
+ println "Install output: ${stdout}"
+ } catch (Exception e) {
+ println "Error installing pre-commit: ${e.message}"
+ return
+ }
+ }
+
+ try {
+ def installHooksProcess = ["python", "-m", "pre_commit", "install"].execute()
+ def stdout = new StringBuilder()
+ def stderr = new StringBuilder()
+ installHooksProcess.waitForProcessOutput(stdout, stderr)
+ if (installHooksProcess.exitValue() != 0) {
+ println "Failed to install hooks: ${stderr}"
+ return
+ }
+ println "Hooks output: ${stdout}"
+ } catch (Exception e) {
+ println "Error installing hooks: ${e.message}"
+ return
+ }
+}
+
+installPreCommitHooks()
\ No newline at end of file
diff --git a/smoke-test/tests/structured_properties/test_structured_properties.py b/smoke-test/tests/structured_properties/test_structured_properties.py
index 533a03a55735a1..e3c33aa406efc4 100644
--- a/smoke-test/tests/structured_properties/test_structured_properties.py
+++ b/smoke-test/tests/structured_properties/test_structured_properties.py
@@ -839,3 +839,49 @@ def validate_search(qualified_name, expected):
# Validate search works for property #1 & #2
validate_search(property1.qualified_name, expected=[])
validate_search(property2.qualified_name, expected=[dataset_urns[0]])
+
+
+def test_structured_properties_list(ingest_cleanup_data, graph_client, caplog):
+ # Create property, assign value to target dataset urn
+ def create_property():
+ property_name = f"listTest{randint(10, 10000)}Property"
+ value_type = "string"
+ property_urn = f"urn:li:structuredProperty:{default_namespace}.{property_name}"
+
+ create_property_definition(
+ property_name=property_name,
+ graph=graph_client,
+ value_type=value_type,
+ cardinality="SINGLE",
+ )
+
+ test_property = StructuredProperties.from_datahub(
+ graph=graph_client, urn=property_urn
+ )
+ assert test_property is not None
+
+ return test_property
+
+ # create 2 structured properties
+ property1 = create_property()
+ property2 = create_property()
+ wait_for_writes_to_sync()
+
+ # validate that urns are in the list
+ structured_properties_urns = [
+ u for u in StructuredProperties.list_urns(graph_client)
+ ]
+ assert property1.urn in structured_properties_urns
+ assert property2.urn in structured_properties_urns
+
+ # list structured properties (full)
+ structured_properties = StructuredProperties.list(graph_client)
+ matched_properties = [
+ p for p in structured_properties if p.urn in [property1.urn, property2.urn]
+ ]
+ assert len(matched_properties) == 2
+ retrieved_property1 = next(p for p in matched_properties if p.urn == property1.urn)
+ retrieved_property2 = next(p for p in matched_properties if p.urn == property2.urn)
+
+ assert property1.dict() == retrieved_property1.dict()
+ assert property2.dict() == retrieved_property2.dict()