|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import difflib |
| 4 | +import logging |
| 5 | +from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union |
| 6 | + |
| 7 | +import datahub.metadata.schema_classes as models |
| 8 | +from datahub.emitter.mcp import MetadataChangeProposalWrapper |
| 9 | +from datahub.errors import SdkUsageError |
| 10 | +from datahub.metadata.schema_classes import SchemaMetadataClass |
| 11 | +from datahub.metadata.urns import DatasetUrn, QueryUrn |
| 12 | +from datahub.sdk._shared import DatasetUrnOrStr |
| 13 | +from datahub.sdk._utils import DEFAULT_ACTOR_URN |
| 14 | +from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping |
| 15 | +from datahub.specific.dataset import DatasetPatchBuilder |
| 16 | +from datahub.sql_parsing.fingerprint_utils import generate_hash |
| 17 | +from datahub.utilities.ordered_set import OrderedSet |
| 18 | + |
if TYPE_CHECKING:
    # Imported only for type annotations to avoid a circular import at runtime.
    from datahub.sdk.main_client import DataHubClient

logger = logging.getLogger(__name__)

# Audit stamp with epoch time 0, used where an audit stamp is required but no
# meaningful timestamp is available (e.g. system-generated query entities).
_empty_audit_stamp = models.AuditStampClass(
    time=0,
    actor=DEFAULT_ACTOR_URN,
)
| 28 | + |
| 29 | + |
class LineageClient:
    """Client for creating dataset-to-dataset lineage in DataHub.

    Supports copy lineage (dataset duplicated as-is, with optional automatic
    column matching) and transform lineage (dataset derived via a query, with
    optional explicit column mapping and query text).
    """

    def __init__(self, client: DataHubClient):
        self._client = client

    def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
        """Return the schema field paths of *dataset_urn*.

        Returns an empty set when the dataset has no schemaMetadata aspect.
        """
        schema_metadata = self._client._graph.get_aspect(
            str(dataset_urn), SchemaMetadataClass
        )
        if schema_metadata is None:
            # Fix: was `return Set()`, which instantiates the deprecated
            # typing.Set alias; use the builtin set instead.
            return set()

        return {field.fieldPath for field in schema_metadata.fields}

    @classmethod
    def _get_strict_column_lineage(
        cls,
        upstream_fields: Set[str],
        downstream_fields: Set[str],
    ) -> ColumnLineageMapping:
        """Map each downstream field to an upstream field whose name matches
        case-insensitively; downstream fields without a match are omitted.
        """
        strict_column_lineage: ColumnLineageMapping = {}

        # Case-insensitive lookup: lowercased name -> original upstream name.
        case_insensitive_map = {field.lower(): field for field in upstream_fields}

        for downstream_field in downstream_fields:
            lower_field = downstream_field.lower()
            if lower_field in case_insensitive_map:
                # Preserve the original casing of the upstream field.
                strict_column_lineage[downstream_field] = [
                    case_insensitive_map[lower_field]
                ]

        return strict_column_lineage

    @classmethod
    def _get_fuzzy_column_lineage(
        cls,
        upstream_fields: Set[str],
        downstream_fields: Set[str],
    ) -> ColumnLineageMapping:
        """Map downstream fields to upstream fields using progressively looser
        matching: exact, then case/underscore-insensitive, then difflib
        similarity. Downstream fields with no match are omitted.
        """

        # Normalization for loose matching: ignore case and underscores.
        def normalize(s: str) -> str:
            return s.lower().replace("_", "")

        # Normalized name -> original upstream name.
        normalized_upstream = {normalize(field): field for field in upstream_fields}

        fuzzy_column_lineage: ColumnLineageMapping = {}
        for downstream_field in downstream_fields:
            # 1. Exact match.
            if downstream_field in upstream_fields:
                fuzzy_column_lineage[downstream_field] = [downstream_field]
                continue

            # 2. Normalized (case/underscore-insensitive) match.
            norm_downstream = normalize(downstream_field)
            if norm_downstream in normalized_upstream:
                fuzzy_column_lineage[downstream_field] = [
                    normalized_upstream[norm_downstream]
                ]
                continue

            # 3. Closest fuzzy match above the similarity cutoff.
            matches = difflib.get_close_matches(
                norm_downstream,
                normalized_upstream.keys(),
                n=1,  # Return only the best match
                cutoff=0.8,  # Adjust cutoff for sensitivity
            )

            if matches:
                fuzzy_column_lineage[downstream_field] = [
                    normalized_upstream[matches[0]]
                ]

        return fuzzy_column_lineage

    def add_dataset_copy_lineage(
        self,
        *,
        upstream: DatasetUrnOrStr,
        downstream: DatasetUrnOrStr,
        column_lineage: Union[
            None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
        ] = "auto_fuzzy",
    ) -> None:
        """Record that *downstream* is a copy of *upstream*.

        Args:
            upstream: Source dataset urn.
            downstream: Copied dataset urn.
            column_lineage: Either an explicit downstream->upstreams mapping,
                "auto_fuzzy"/"auto_strict" to infer the mapping from both
                datasets' schemas, or None to skip column-level lineage.

        Raises:
            SdkUsageError: If *column_lineage* is not one of the accepted forms.
        """
        upstream = DatasetUrn.from_string(upstream)
        downstream = DatasetUrn.from_string(downstream)

        if column_lineage is None:
            cll = None
        elif column_lineage in ["auto_fuzzy", "auto_strict"]:
            upstream_schema = self._get_fields_from_dataset_urn(upstream)
            downstream_schema = self._get_fields_from_dataset_urn(downstream)
            if column_lineage == "auto_fuzzy":
                mapping = self._get_fuzzy_column_lineage(
                    upstream_schema, downstream_schema
                )
            else:
                mapping = self._get_strict_column_lineage(
                    upstream_schema, downstream_schema
                )
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=mapping,
            )
        elif isinstance(column_lineage, dict):
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=column_lineage,
            )
        else:
            # Fix: previously an unrecognized value left `cll` unbound and
            # surfaced as a confusing NameError below.
            raise SdkUsageError(
                f"Invalid column_lineage value: {column_lineage!r}. Expected "
                'None, "auto_fuzzy", "auto_strict", or a column mapping dict.'
            )

        updater = DatasetPatchBuilder(str(downstream))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream),
                type=models.DatasetLineageTypeClass.COPY,
            )
        )
        for cl in cll or []:
            updater.add_fine_grained_upstream_lineage(cl)

        self._client.entities.update(updater)

    def add_dataset_transform_lineage(
        self,
        *,
        upstream: DatasetUrnOrStr,
        downstream: DatasetUrnOrStr,
        column_lineage: Optional[ColumnLineageMapping] = None,
        query_text: Optional[str] = None,
    ) -> None:
        """Record that *downstream* is derived from *upstream* via a transform.

        Args:
            upstream: Source dataset urn.
            downstream: Derived dataset urn.
            column_lineage: Optional explicit downstream->upstreams mapping.
            query_text: Optional SQL text; when given, a Query entity is
                created (keyed by a fingerprint of the text) and linked from
                the lineage edge.

        Raises:
            SdkUsageError: If the downstream dataset does not exist.
        """
        upstream = DatasetUrn.from_string(upstream)
        downstream = DatasetUrn.from_string(downstream)

        cll = None
        if column_lineage is not None:
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=column_lineage,
            )

        # All urns (datasets and schema fields) the query touches; used as the
        # query entity's subjects.
        fields_involved = OrderedSet([str(upstream), str(downstream)])
        if cll is not None:
            for c in cll:
                for field in c.upstreams or []:
                    fields_involved.add(field)
                for field in c.downstreams or []:
                    fields_involved.add(field)

        query_urn = None
        query_entity = None
        if query_text:
            # Eventually we might want to use our regex-based fingerprinting instead.
            fingerprint = generate_hash(query_text)
            query_urn = QueryUrn(fingerprint).urn()

            from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects

            query_entity = MetadataChangeProposalWrapper.construct_many(
                query_urn,
                aspects=[
                    models.QueryPropertiesClass(
                        statement=models.QueryStatementClass(
                            value=query_text, language=models.QueryLanguageClass.SQL
                        ),
                        source=models.QuerySourceClass.SYSTEM,
                        created=_empty_audit_stamp,
                        lastModified=_empty_audit_stamp,
                    ),
                    make_query_subjects(list(fields_involved)),
                ],
            )

        updater = DatasetPatchBuilder(str(downstream))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream),
                type=models.DatasetLineageTypeClass.TRANSFORMED,
                query=query_urn,
            )
        )
        for cl in cll or []:
            cl.query = query_urn
            updater.add_fine_grained_upstream_lineage(cl)

        # Throw if the dataset does not exist.
        # We need to manually call .build() instead of reusing client.update()
        # so that we make just one emit_mcps call.
        if not self._client._graph.exists(updater.urn):
            raise SdkUsageError(
                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
            )
        mcps: List[
            Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
        ] = list(updater.build())
        if query_entity:
            mcps.extend(query_entity)
        self._client._graph.emit_mcps(mcps)
0 commit comments