
Commit bf1faaa

Merge branch 'master' into include-system-timestamp-for-metadata-modification
2 parents 394a57d + 32c62e5 commit bf1faaa

44 files changed: 3,858 additions and 443 deletions


datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java

Lines changed: 4 additions & 0 deletions
@@ -111,4 +111,8 @@ public static String getEntityTypeUrn(String name) {
     }
     return ENTITY_NAME_TO_ENTITY_TYPE_URN.get(name);
   }
+
+  public static boolean isValidEntityType(String entityTypeUrn) {
+    return ENTITY_TYPE_URN_TO_NAME.containsKey(entityTypeUrn);
+  }
 }

datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/structuredproperty/StructuredPropertyMapper.java

Lines changed: 17 additions & 1 deletion
@@ -20,6 +20,7 @@
 import com.linkedin.datahub.graphql.generated.StructuredPropertySettings;
 import com.linkedin.datahub.graphql.generated.TypeQualifier;
 import com.linkedin.datahub.graphql.types.common.mappers.util.MappingHelper;
+import com.linkedin.datahub.graphql.types.entitytype.EntityTypeUrnMapper;
 import com.linkedin.datahub.graphql.types.mappers.MapperUtils;
 import com.linkedin.datahub.graphql.types.mappers.ModelMapper;
 import com.linkedin.entity.EntityResponse;
@@ -30,7 +31,9 @@
 import java.util.stream.Collectors;
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
+import lombok.extern.slf4j.Slf4j;

+@Slf4j
 public class StructuredPropertyMapper
     implements ModelMapper<EntityResponse, StructuredPropertyEntity> {

@@ -141,8 +144,21 @@ private TypeQualifier mapTypeQualifier(final StringArrayMap gmsTypeQualifier) {
     final TypeQualifier typeQualifier = new TypeQualifier();
     List<String> allowedTypes = gmsTypeQualifier.get(ALLOWED_TYPES);
     if (allowedTypes != null) {
+      // filter out invalid allowedTypes
+      List<String> validAllowedTypes =
+          allowedTypes.stream()
+              .filter(EntityTypeUrnMapper::isValidEntityType)
+              .collect(Collectors.toList());
+      if (validAllowedTypes.size() != allowedTypes.size()) {
+        log.error(
+            String.format(
+                "Property has invalid allowed types set. Current list of allowed types: %s",
+                allowedTypes));
+      }
       typeQualifier.setAllowedTypes(
-          allowedTypes.stream().map(this::createEntityTypeEntity).collect(Collectors.toList()));
+          validAllowedTypes.stream()
+              .map(this::createEntityTypeEntity)
+              .collect(Collectors.toList()));
     }
     return typeQualifier;
   }

datahub-web-react/src/app/analyticsDashboard/components/TimeSeriesChart.tsx

Lines changed: 34 additions & 3 deletions
@@ -84,6 +84,38 @@ export function computeLines(chartData: TimeSeriesChartType, insertBlankPoints:
     return returnLines;
 }

+const formatAxisDate = (value: number, chartData: TimeSeriesChartType) => {
+    const date = new Date(value);
+
+    switch (chartData.interval) {
+        case 'MONTH':
+            return date.toLocaleDateString('en-US', {
+                month: 'short',
+                year: 'numeric',
+                timeZone: 'UTC',
+            });
+        case 'WEEK':
+            return date.toLocaleDateString('en-US', {
+                month: 'short',
+                day: 'numeric',
+                timeZone: 'UTC',
+            });
+        case 'DAY':
+            return date.toLocaleDateString('en-US', {
+                weekday: 'short',
+                day: 'numeric',
+                timeZone: 'UTC',
+            });
+        default:
+            return date.toLocaleDateString('en-US', {
+                month: 'short',
+                day: 'numeric',
+                year: 'numeric',
+                timeZone: 'UTC',
+            });
+    }
+};
+
 export const TimeSeriesChart = ({
     chartData,
     width,
@@ -117,6 +149,7 @@ export const TimeSeriesChart = ({
                 strokeWidth={style?.axisWidth}
                 tickLabelProps={{ fill: 'black', fontFamily: 'inherit', fontSize: 10 }}
                 numTicks={3}
+                tickFormat={(value) => formatAxisDate(value, chartData)}
             />
             <Axis
                 orientation="right"
@@ -151,9 +184,7 @@ export const TimeSeriesChart = ({
                 tooltipData?.nearestDatum && (
                     <div>
                         <div>
-                            {new Date(
-                                Number(accessors.xAccessor(tooltipData.nearestDatum.datum)),
-                            ).toDateString()}
+                            {formatAxisDate(accessors.xAccessor(tooltipData.nearestDatum.datum), chartData)}
                         </div>
                         <div>{accessors.yAccessor(tooltipData.nearestDatum.datum)}</div>
                     </div>

docs/cli.md

Lines changed: 1 addition & 1 deletion
@@ -735,7 +735,7 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo
 | [bigquery](./generated/ingestion/sources/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
 | [datahub-lineage-file](./generated/ingestion/sources/file-based-lineage.md) | _no additional dependencies_ | Lineage File source |
 | [datahub-business-glossary](./generated/ingestion/sources/business-glossary.md) | _no additional dependencies_ | Business Glossary File source |
-| [dbt](./generated/ingestion/sources/dbt.md) | _no additional dependencies_ | dbt source |
+| [dbt](./generated/ingestion/sources/dbt.md) | `pip install 'acryl-datahub[dbt]'` | dbt source |
 | [dremio](./generated/ingestion/sources/dremio.md) | `pip install 'acryl-datahub[dremio]'` | Dremio Source |
 | [druid](./generated/ingestion/sources/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
 | [feast](./generated/ingestion/sources/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source (0.26.0) |

docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor.md

Lines changed: 44 additions & 0 deletions
@@ -125,6 +125,50 @@ The Helm chart [datahub-executor-worker](https://executor-helm.acryl.io/index.ya
        --set image.tag=v0.3.1 \
        acryl datahub-executor-worker
    ```
+9. As of DataHub Cloud `v0.3.8.2`, it is possible to pass secrets to ingestion recipes using Kubernetes Secret CRDs, as shown below. This allows secrets to be updated at runtime without restarting the Remote Executor process.
+   ```
+   # 1. Create a K8s Secret object in the remote executor namespace, e.g.
+   apiVersion: v1
+   kind: Secret
+   metadata:
+     name: datahub-secret-store
+   data:
+     REDSHIFT_PASSWORD: cmVkc2hpZnQtc2VjcmV0Cg==
+     SNOWFLAKE_PASSWORD: c25vd2ZsYWtlLXNlY3JldAo=
+   # 2. Add the secret to your Remote Executor deployment:
+   extraVolumes:
+     - name: datahub-secret-store
+       secret:
+         secretName: datahub-secret-store
+   # 3. Mount it under the /mnt/secrets directory
+   extraVolumeMounts:
+     - mountPath: /mnt/secrets
+       name: datahub-secret-store
+   ```
+   You can then reference the mounted secrets directly in the ingestion recipe:
+   ```yaml
+   source:
+     type: redshift
+     config:
+       host_port: '<redshift host:port>'
+       username: connector_test
+       table_lineage_mode: mixed
+       include_table_lineage: true
+       include_tables: true
+       include_views: true
+       profiling:
+         enabled: true
+         profile_table_level_only: false
+       stateful_ingestion:
+         enabled: true
+       password: '${REDSHIFT_PASSWORD}'
+   ```
+
+   By default, the executor looks for files mounted in `/mnt/secrets`. This can be overridden by setting the env var `DATAHUB_EXECUTOR_FILE_SECRET_BASEDIR` to a different location (default: `/mnt/secrets`).
+
+   These files are expected to be under 1 MB by default. To increase this limit, set a higher value via `DATAHUB_EXECUTOR_FILE_SECRET_MAXLEN` (default: `1024768`, size in bytes).

 ## FAQ


docs/managed-datahub/release-notes/v_0_3_8.md

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

 Release Availability Date
 ---
-21-Jan-2025
+29-Jan-2025

 Recommended CLI/SDK
 ---

docs/modeling/extending-the-metadata-model.md

Lines changed: 2 additions & 0 deletions
@@ -361,6 +361,8 @@ It takes the following parameters:
 This annotation is applied to fields inside an Aspect. It instructs DataHub to index the field so it can be retrieved
 via the search APIs.

+:::note If you are adding @Searchable to a field that already has data, you'll want to restore indices [via api](https://datahubproject.io/docs/api/restli/restore-indices/) or [via upgrade step](https://github.com/datahub-project/datahub/blob/master/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/RestoreGlossaryIndices.java) so that it is populated with existing data.
+
 It takes the following parameters:

 - **fieldType**: string - The settings for how each field is indexed is defined by the field type. Each field type is

metadata-ingestion/setup.cfg

Lines changed: 4 additions & 1 deletion
@@ -15,6 +15,9 @@ warn_unused_configs = yes
 disallow_untyped_defs = no

 # try to be a bit more strict in certain areas of the codebase
+[mypy-datahub]
+# Only for datahub's __init__.py - allow implicit reexport
+implicit_reexport = yes
 [mypy-datahub.*]
 ignore_missing_imports = no
 implicit_reexport = no
@@ -54,7 +57,7 @@ addopts = --cov=src --cov-report= --cov-config setup.cfg --strict-markers -p no:
 markers =
     slow: marks tests that are slow to run, including all docker-based tests (deselect with '-m not slow')
     integration: marks all integration tests, across all batches (deselect with '-m "not integration"')
-    integration_batch_0: mark tests to run in batch 0 of integration tests. This is done mainly for parallelisation in CI. Batch 0 is the default batch.
+    integration_batch_0: mark tests to run in batch 0 of integration tests. This is done mainly for parallelization in CI. Batch 0 is the default batch.
     integration_batch_1: mark tests to run in batch 1 of integration tests
     integration_batch_2: mark tests to run in batch 2 of integration tests
 testpaths =
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+from datahub.configuration.common import MetaError
+
+# TODO: Move all other error types to this file.
+
+
+class SdkUsageError(MetaError):
+    pass
+
+
+class AlreadyExistsError(SdkUsageError):
+    pass
+
+
+class ItemNotFoundError(SdkUsageError):
+    pass
+
+
+class MultipleItemsFoundError(SdkUsageError):
+    pass
+
+
+class SchemaFieldKeyError(SdkUsageError, KeyError):
+    pass
+
+
+class IngestionAttributionWarning(Warning):
+    pass
+
+
+class MultipleSubtypesWarning(Warning):
+    pass
+
+
+class ExperimentalWarning(Warning):
+    pass
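
For a sense of how these SDK error and warning types might be used by calling code, here is a minimal, self-contained sketch. The import path (`datahub.errors`) and the `register_item` helper are assumptions for illustration; the commit view above does not show the new file's location, and only the class names come from the diff.

```python
import warnings

# Assumed import path for the new module shown above; adjust to the actual file location.
from datahub.errors import AlreadyExistsError, ExperimentalWarning, SdkUsageError


def register_item(registry: dict, key: str, urn: str) -> None:
    """Illustrative helper: reject duplicate registrations with the SDK error type."""
    if key in registry:
        raise AlreadyExistsError(f"item {key!r} already exists")
    registry[key] = urn


if __name__ == "__main__":
    # The Warning subclasses plug into the standard warnings machinery.
    warnings.warn("this code path is experimental", ExperimentalWarning)

    items: dict = {}
    register_item(items, "dataset", "urn:li:dataset:example")
    try:
        register_item(items, "dataset", "urn:li:dataset:example")
    except SdkUsageError as e:
        # AlreadyExistsError is caught via its SdkUsageError base class.
        print(f"caught expected error: {e}")
```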

metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py

Lines changed: 3 additions & 2 deletions
@@ -1673,10 +1673,11 @@ def to_platform_resource(
             primary_key="",
         )

-        # Extract user email mappings
+        # Extract user email mappings.
+        # Sort it to ensure the order is deterministic.
         user_email_cache = {
             user_id: user.email
-            for user_id, user in self._user_cache.items()
+            for user_id, user in sorted(self._user_cache.items())
             if user.email
         }

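The one-line change above exists purely to make the emitted mapping deterministic. A small, self-contained sketch of the effect (the user IDs and emails below are made up for the example):

```python
# Two caches with the same entries inserted in different orders.
cache_a = {"user_2": "b@example.com", "user_1": "a@example.com"}
cache_b = {"user_1": "a@example.com", "user_2": "b@example.com"}

# Plain iteration preserves insertion order, so the derived item lists differ.
assert list(cache_a.items()) != list(cache_b.items())

# Sorting the items first makes the comprehension's output order deterministic,
# mirroring the sorted(self._user_cache.items()) change above.
sorted_a = {user_id: email for user_id, email in sorted(cache_a.items()) if email}
sorted_b = {user_id: email for user_id, email in sorted(cache_b.items()) if email}
assert list(sorted_a.items()) == list(sorted_b.items())
```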
metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py

Lines changed: 10 additions & 0 deletions
@@ -6,6 +6,7 @@
 from typing import Callable, Dict, Iterable, List, MutableMapping, Optional

 from datahub.ingestion.api.report import SupportsAsObj
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
 from datahub.ingestion.source.snowflake.snowflake_query import (
@@ -100,6 +101,9 @@ class SnowflakeTable(BaseTable):
     def is_hybrid(self) -> bool:
         return self.type is not None and self.type == "HYBRID TABLE"

+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.TABLE
+

 @dataclass
 class SnowflakeView(BaseView):
@@ -109,6 +113,9 @@ class SnowflakeView(BaseView):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_secure: bool = False

+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.VIEW
+

 @dataclass
 class SnowflakeSchema:
@@ -154,6 +161,9 @@ class SnowflakeStream:
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     last_altered: Optional[datetime] = None

+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.SNOWFLAKE_STREAM
+

 class _SnowflakeTagCache:
     def __init__(self) -> None:

metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py

Lines changed: 11 additions & 14 deletions
@@ -21,7 +21,6 @@
 from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
-    DatasetSubTypes,
 )
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
@@ -467,7 +466,13 @@ def _process_schema(
                 context=f"{db_name}.{schema_name}",
             )

-    def _process_tags(self, snowflake_schema, schema_name, db_name, domain):
+    def _process_tags(
+        self,
+        snowflake_schema: SnowflakeSchema,
+        schema_name: str,
+        db_name: str,
+        domain: str,
+    ) -> None:
         snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
             schema_name=schema_name, db_name=db_name, domain=domain
         )
@@ -837,15 +842,7 @@ def gen_dataset_workunits(
         if dpi_aspect:
             yield dpi_aspect

-        subTypes = SubTypes(
-            typeNames=(
-                [DatasetSubTypes.SNOWFLAKE_STREAM]
-                if isinstance(table, SnowflakeStream)
-                else [DatasetSubTypes.VIEW]
-                if isinstance(table, SnowflakeView)
-                else [DatasetSubTypes.TABLE]
-            )
-        )
+        subTypes = SubTypes(typeNames=[table.get_subtype()])

         yield MetadataChangeProposalWrapper(
             entityUrn=dataset_urn, aspect=subTypes
@@ -932,9 +929,9 @@ def get_dataset_properties(
                 "OWNER_ROLE_TYPE": table.owner_role_type,
                 "TABLE_NAME": table.table_name,
                 "BASE_TABLES": table.base_tables,
-                "STALE_AFTER": table.stale_after.isoformat()
-                if table.stale_after
-                else None,
+                "STALE_AFTER": (
+                    table.stale_after.isoformat() if table.stale_after else None
+                ),
             }.items()
             if v
         }

metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py

Lines changed: 10 additions & 6 deletions
@@ -352,6 +352,15 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str)
         )
         self.report.sql_aggregator = self.aggregator.report

+    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
+        """Add default SQLAlchemy options. Can be overridden by subclasses to add additional defaults."""
+        # Extra default SQLAlchemy option for better connection pooling and threading.
+        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
+        if sql_config.is_profiling_enabled():
+            sql_config.options.setdefault(
+                "max_overflow", sql_config.profiling.max_workers
+            )
+
     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()
@@ -519,12 +528,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
         # Known issue with sqlalchemy https://stackoverflow.com/questions/60804288/pycharm-duplicated-log-for-sqlalchemy-echo-true
         sqlalchemy_log._add_default_handler = lambda x: None  # type: ignore

-        # Extra default SQLAlchemy option for better connection pooling and threading.
-        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
-        if sql_config.is_profiling_enabled():
-            sql_config.options.setdefault(
-                "max_overflow", sql_config.profiling.max_workers
-            )
+        self._add_default_options(sql_config)

         for inspector in self.get_inspectors():
             profiler = None
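
Since the point of this refactor is that `_add_default_options` is now an overridable hook, here is a minimal sketch of how a dialect-specific source could extend it. The base class name (`SQLAlchemySource`), the import paths, and the extra `pool_pre_ping` default are assumptions for illustration; only `_add_default_options`, `SQLCommonConfig`, and the `max_overflow` default appear in the diff above.

```python
# Sketch only: the class name and import paths below are assumed, since the
# enclosing class is not visible in this diff view.
from datahub.ingestion.source.sql.sql_common import SQLAlchemySource
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig


class MyDialectSource(SQLAlchemySource):
    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
        # Keep the shared default (max_overflow when profiling is enabled)...
        super()._add_default_options(sql_config)
        # ...then layer on a dialect-specific SQLAlchemy option, without
        # clobbering anything the user already set in the recipe's `options`.
        sql_config.options.setdefault("pool_pre_ping", True)
```

Callers do not need to change: `get_workunits_internal` now invokes `self._add_default_options(sql_config)`, so an override like the one above is picked up automatically.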
