diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md index 457725b6783e5..0c939fd1d3423 100644 --- a/metadata-ingestion/docs/dev_guides/classification.md +++ b/metadata-ingestion/docs/dev_guides/classification.md @@ -6,21 +6,21 @@ The classification feature enables sources to be configured to automatically pre Note that a `.` is used to denote nested fields in the YAML recipe. -| Field | Required | Type | Description | Default | -| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |------------------------------------------------------------| -| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False | -| sample_size | | int | Number of sample values used for classification. | 100 | -| max_workers | | int | Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable. | 1 | -| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. | -| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedance. | [{'type': 'datahub', 'config': None}] | -| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. 
Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} | -| table_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] | -| table_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] | -| table_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True | -| column_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter columns for classification. This is used in combination with other patterns in parent config. Specify regex to match the column name in `database.schema.table.column` format. | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} | -| column_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] | -| column_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] | -| column_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True | +| Field | Required | Type | Description | Default | +| ------------------------- | -------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | +| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False | +| sample_size | | int | Number of sample values used for classification. 
| 100 | +| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. | Number of CPU cores (falls back to 4 if the count cannot be determined) | +| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. | +| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedence. | [{'type': 'datahub', 'config': None}] | +| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.\*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} | +| table_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] | +| table_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] | +| table_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True | +| column_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter columns for classification. This is used in combination with other patterns in parent config. Specify regex to match the column name in `database.schema.table.column` format. | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} | +| column_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] | +| column_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] | +| column_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. 
| True | ## DataHub Classifier @@ -28,27 +28,29 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d ### Config Details -| Field | Required | Type | Description | Default | -| ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| confidence_level_threshold | | number | | 0.68 | -| strip_exclusion_formatting | | bool | A flag that determines whether the exclusion list uses exact matching or format stripping (case-insensitivity, punctuation removal, and special character removal). | True | -| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None | -| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. 
| +| Field | Required | Type | Description | Default | +| ------------------------------------------------------ | ------------------------------------------------------ | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| confidence_level_threshold | | number | | 0.68 | +| strip_exclusion_formatting | | bool | A flag that determines whether the exclusion list uses exact matching or format stripping (case-insensitivity, punctuation removal, and special character removal). | True | +| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None | +| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. | | info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | | -| info_types_config.`key`.exclude_name | | list[string] | Optional list of names to exclude from classification. 
| None | -| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | | -| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] | -| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | | -| info_types_config.`key`.description.regex | | Array of string | List of regex patterns the column description follows for the info type | ['.*'] | -| info_types_config.`key`.datatype | | DataTypeFactorConfig (see below for fields) | | | -| info_types_config.`key`.datatype.type | | Array of string | List of data types for the info type | ['.*'] | -| info_types_config.`key`.values | | ValuesFactorConfig (see below for fields) | | | +| info_types_config.`key`.exclude_name | | list[string] | Optional list of names to exclude from classification. | None | +| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | | +| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] | +| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | | +| info_types_config.`key`.description.regex | | Array of string | List of regex patterns the column description follows for the info type | ['.*'] | +| info_types_config.`key`.datatype | | DataTypeFactorConfig (see below for fields) | | | +| info_types_config.`key`.datatype.type | | Array of string | List of data types for the info type | ['.*'] | +| info_types_config.`key`.values | | ValuesFactorConfig (see below for fields) | | | | info_types_config.`key`.values.prediction_type | ❓ (required if info_types_config.`key`.values is set) | string | | None | -| info_types_config.`key`.values.regex | | Array of string | List of regex patterns the column value follows for the info type | None | -| info_types_config.`key`.values.library | | Array of string | Library used for prediction | None | -| 
minimum_values_threshold | | number | Minimum number of non-null column values required to process `values` prediction factor. | 50 | +| info_types_config.`key`.values.regex | | Array of string | List of regex patterns the column value follows for the info type | None | +| info_types_config.`key`.values.library | | Array of string | Library used for prediction | None | +| minimum_values_threshold | | number | Minimum number of non-null column values required to process `values` prediction factor. | 50 | | | + ### Supported infotypes + - `Email_Address` - `Gender` - `Credit_Debit_Card_Number` @@ -73,8 +75,7 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d - Classification for nested columns (struct, array type) - -## Examples +## Examples ### Basic @@ -99,7 +100,7 @@ source: classification: enabled: True classifiers: - - type: datahub + - type: datahub ``` ### Advanced Configuration: Customizing configuration for supported info types @@ -127,7 +128,7 @@ source: info_type_to_term: Email_Address: "Email" classifiers: - - type: datahub + - type: datahub config: confidence_level_threshold: 0.7 info_types_config: @@ -403,10 +404,8 @@ source: regex: [] library: - rule_based_logic - ``` - ### Advanced Configuration: Specifying Custom InfoType ```yml @@ -430,7 +429,7 @@ source: classification: enabled: True classifiers: - - type: datahub + - type: datahub config: confidence_level_threshold: 0.7 minimum_values_threshold: 10 @@ -452,4 +451,4 @@ source: ### DataHub Blog -* [PII Classification just got easier with DataHub](https://blog.datahubproject.io/pii-classification-just-got-easier-with-datahub-6bab2b63abcb) +- [PII Classification just got easier with DataHub](https://blog.datahubproject.io/pii-classification-just-got-easier-with-datahub-6bab2b63abcb) diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 5ae465bbc89ee..24f2e12423d97 100644 --- 
a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -1,4 +1,5 @@ import logging +import multiprocessing import os import platform import sys @@ -217,6 +218,14 @@ def init(use_password: bool = False) -> None: def main(**kwargs): + # We use threads in a variety of places within our CLI. The multiprocessing + # "fork" start method is not safe to use with threads. + # macOS and Windows already default to "spawn"; on Linux, Python 3.14 changes the default from "fork" to "forkserver". + # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + # Eventually it may make sense to use "forkserver" as the default where available, + # but we can revisit that in the future. + multiprocessing.set_start_method("spawn", force=True) + # This wrapper prevents click from suppressing errors. try: sys.exit(datahub(standalone_mode=False, **kwargs)) diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index 98c43079a3bc1..4465317ae351a 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -1,5 +1,6 @@ import concurrent.futures import logging +import multiprocessing from dataclasses import dataclass, field from functools import partial from math import ceil @@ -182,6 +183,11 @@ def async_classify( with concurrent.futures.ProcessPoolExecutor( max_workers=self.config.classification.max_workers, + # The fork start method, which is the default on Linux for Python < 3.14, is not + # safe when the main process uses threads. The default start method on Windows/macOS is + # already spawn, and the Linux default changes from fork to forkserver in Python 3.14. 
+ # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + mp_context=multiprocessing.get_context("spawn"), ) as executor: column_info_proposal_futures = [ executor.submit( diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py index bdcdcb8990eba..ddcb74e354613 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py @@ -1,3 +1,4 @@ +import os from abc import ABCMeta, abstractmethod from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -37,8 +38,8 @@ class ClassificationConfig(ConfigModel): ) max_workers: int = Field( - default=1, - description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.", + default=(os.cpu_count() or 4), + description="Number of worker processes to use for classification. Set to 1 to disable.", ) table_pattern: AllowDenyPattern = Field(