diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml index e23491c..2c040c8 100644 --- a/.github/workflows/changelog.yml +++ b/.github/workflows/changelog.yml @@ -7,7 +7,7 @@ on: - master env: - DEFAULT_PYTHON: '3.12' + DEFAULT_PYTHON: '3.13' permissions: contents: read diff --git a/.github/workflows/dev-release.yml b/.github/workflows/dev-release.yml index e7fbf7d..c00d694 100644 --- a/.github/workflows/dev-release.yml +++ b/.github/workflows/dev-release.yml @@ -9,7 +9,7 @@ on: workflow_dispatch: env: - DEFAULT_PYTHON: '3.12' + DEFAULT_PYTHON: '3.13' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ad3d190..ffd3fad 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -6,7 +6,7 @@ on: - '[0-9]+.[0-9]+.[0-9]+' env: - DEFAULT_PYTHON: '3.12' + DEFAULT_PYTHON: '3.13' jobs: release: @@ -110,4 +110,5 @@ jobs: name: ${{ steps.release-name.outputs.name }} body_path: changelog.md files: | - dist/* + dist/*.tar.gz + dist/*.whl diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ca91fa9..56431f4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ concurrency: cancel-in-progress: true env: - DEFAULT_PYTHON: '3.12' + DEFAULT_PYTHON: '3.13' jobs: tests: @@ -22,9 +22,13 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.12'] - pydantic-version: ['1', '2'] - os: [ubuntu-latest] + include: + - os: ubuntu-latest + python-version: '3.13' + pydantic-version: '2' + - os: ubuntu-22.04 + python-version: '3.7' + pydantic-version: '1' steps: - name: Checkout code @@ -116,7 +120,7 @@ jobs: run: ./combine_coverage.sh - name: Check coverage - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} file: ./reports/coverage.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
43476b1..d3b52d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: args: - --license-filepath - .spdx-license-header.txt - - --use-current-year + - --allow-past-years - --no-extra-eol - repo: https://github.com/codespell-project/codespell @@ -78,7 +78,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.19.0 + rev: v3.19.1 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] @@ -102,7 +102,7 @@ repos: - black==24.4.2 - repo: https://github.com/pycqa/bandit - rev: 1.7.10 + rev: 1.8.2 hooks: - id: bandit args: diff --git a/.readthedocs.yml b/.readthedocs.yml index 894a5fc..980b842 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,9 +1,12 @@ version: 2 +sphinx: + configuration: docs/conf.py + build: os: ubuntu-22.04 tools: - python: '3.12' + python: '3.13' jobs: post_checkout: - git fetch --unshallow || true diff --git a/.spdx-license-header.txt b/.spdx-license-header.txt index 44939ae..ac93210 100644 --- a/.spdx-license-header.txt +++ b/.spdx-license-header.txt @@ -1,2 +1,2 @@ -SPDX-FileCopyrightText: 2021-2024 MTS PJSC +SPDX-FileCopyrightText: 2021-2025 MTS PJSC SPDX-License-Identifier: Apache-2.0 diff --git a/LICENSE.txt b/LICENSE.txt index 6b68d87..0b0ca7e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright 2021-2024 MTS PJSC. All rights reserved. +Copyright 2021-2025 MTS PJSC. All rights reserved. Apache License Version 2.0, January 2004 diff --git a/README.rst b/README.rst index c865f5d..dd8979f 100644 --- a/README.rst +++ b/README.rst @@ -3,24 +3,35 @@ ETL Entities ============ -|Repo Status| |PyPI| |PyPI License| |PyPI Python Version| -|Documentation| |Build Status| |Coverage| |pre-commit.ci| +|Repo Status| |PyPI Latest Release| |PyPI License| |PyPI Python Version| |PyPI Downloads| +|Documentation| |CI Status| |Test Coverage| |pre-commit.ci Status| .. 
|Repo Status| image:: https://www.repostatus.org/badges/latest/active.svg + :alt: Repo status - Active :target: https://github.com/MobileTeleSystems/etl-entities -.. |PyPI| image:: https://img.shields.io/pypi/v/etl-entities +.. |PyPI Latest Release| image:: https://img.shields.io/pypi/v/etl-entities + :alt: PyPI - Latest Release :target: https://pypi.org/project/etl-entities/ .. |PyPI License| image:: https://img.shields.io/pypi/l/etl-entities.svg + :alt: PyPI - License :target: https://github.com/MobileTeleSystems/etl-entities/blob/develop/LICENSE.txt .. |PyPI Python Version| image:: https://img.shields.io/pypi/pyversions/etl-entities.svg - :target: https://badge.fury.io/py/etl-entities + :alt: PyPI - Python Version + :target: https://pypi.org/project/etl-entities/ +.. |PyPI Downloads| image:: https://img.shields.io/pypi/dm/etl-entities + :alt: PyPI - Downloads + :target: https://pypi.org/project/etl-entities/ .. |Documentation| image:: https://readthedocs.org/projects/etl-entities/badge/?version=stable + :alt: Documentation - ReadTheDocs :target: https://etl-entities.readthedocs.io/ -.. |Build Status| image:: https://github.com/MobileTeleSystems/etl-entities/workflows/Tests/badge.svg +.. |CI Status| image:: https://github.com/MobileTeleSystems/etl-entities/workflows/Tests/badge.svg + :alt: Github Actions - latest CI build status :target: https://github.com/MobileTeleSystems/etl-entities/actions -.. |Coverage| image:: https://codecov.io/gh/MobileTeleSystems/etl-entities/branch/develop/graph/badge.svg?token=RIO8URKNZJ +.. |Test Coverage| image:: https://codecov.io/gh/MobileTeleSystems/etl-entities/branch/develop/graph/badge.svg?token=RIO8URKNZJ + :alt: Test coverage - percent :target: https://codecov.io/gh/MobileTeleSystems/etl-entities -.. |pre-commit.ci| image:: https://results.pre-commit.ci/badge/github/MobileTeleSystems/etl-entities/develop.svg +.. 
|pre-commit.ci Status| image:: https://results.pre-commit.ci/badge/github/MobileTeleSystems/etl-entities/develop.svg + :alt: pre-commit.ci - status :target: https://results.pre-commit.ci/latest/github/MobileTeleSystems/etl-entities/develop What is ETL Entities? @@ -35,6 +46,7 @@ Currently implemented: * ``ColumnDateHWM`` * ``ColumnDateTimeHWM`` * ``FileListHWM`` + * ``FileModifiedTimeHWM`` * ``KeyValueIntHWM`` * HWM Store classes: diff --git a/codecov.yml b/codecov.yml index 7291c7a..f70a673 100644 --- a/codecov.yml +++ b/codecov.yml @@ -2,5 +2,5 @@ coverage: status: project: default: - target: 94% + target: 92% threshold: 1% diff --git a/docs/changelog/2.5.0.rst b/docs/changelog/2.5.0.rst new file mode 100644 index 0000000..0aca6c8 --- /dev/null +++ b/docs/changelog/2.5.0.rst @@ -0,0 +1,13 @@ +2.5.0 (2025-01-27) +================== + +Features +-------- + +- Implement ``FileModifiedTimeHWM``, HWM based on file modification time. (:github:pull:`109`) + + +Improvements +------------ + +- Add compatibility with ``Python 3.13`` (:github:pull:`106`) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index b965974..7d532e6 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,6 +3,7 @@ :caption: Changelog DRAFT + 2.5.0 2.4.0 2.3.1 2.3.0 diff --git a/docs/conf.py b/docs/conf.py index ccde1b4..1909d57 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,7 @@ # -- Project information ----------------------------------------------------- project = "etl-entities" -copyright = "2021-2024 MTS PJSC" +copyright = "2021-2025 MTS PJSC" author = "DataOps.ETL" # The version info for the project you're documenting, acts as replacement for diff --git a/docs/hwm/file/file_mtime_hwm.rst b/docs/hwm/file/file_mtime_hwm.rst new file mode 100644 index 0000000..05a2471 --- /dev/null +++ b/docs/hwm/file/file_mtime_hwm.rst @@ -0,0 +1,7 @@ +File Modified Time HWM +=========================== + +.. currentmodule:: etl_entities.hwm.file.file_mtime_hwm + +.. 
autoclass:: FileModifiedTimeHWM + :members: name, set_value, dict, json, copy, covers, update diff --git a/docs/hwm/file/index.rst b/docs/hwm/file/index.rst index 4f7ff5a..fdff2f5 100644 --- a/docs/hwm/file/index.rst +++ b/docs/hwm/file/index.rst @@ -8,6 +8,7 @@ File HWM :caption: HWM classes file_list_hwm + file_mtime_hwm What is File HWM? ----------------- @@ -40,18 +41,13 @@ This technique is called ``High WaterMark`` or ``HWM`` for short. It is used by different `strategies `_ to implement some complex logic of filtering source data. - Supported types --------------- There are a several ways to track HWM value: - * Save the entire file list, and then select only files not present in this list - (``file_list``) - * Save max modified time of all files, and then select only files with ``modified_time`` + * Save list of file paths, and then select only files not present in this list - :obj:`FileListHWM` + * Save max modified time of all files, and then select only files with modified time (``file.stat().st_mtime``) - :obj:`FileModifiedTimeHWM` higher than this value * If file name contains some incrementing value, e.g. id or datetime, - parse it and save max value of all files, then select only files with higher value - * and so on - -Currently the only HWM type implemented for files is ``file_list``. Other ones can be implemented on-demand + parse it and save max value of all files, then select only files with higher value - not implemented for now. 
diff --git a/etl_entities/VERSION b/etl_entities/VERSION index 197c4d5..437459c 100644 --- a/etl_entities/VERSION +++ b/etl_entities/VERSION @@ -1 +1 @@ -2.4.0 +2.5.0 diff --git a/etl_entities/__init__.py b/etl_entities/__init__.py index e23bb5b..5c36d98 100644 --- a/etl_entities/__init__.py +++ b/etl_entities/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import os diff --git a/etl_entities/entity.py b/etl_entities/entity.py index 213b0ce..af3a968 100644 --- a/etl_entities/entity.py +++ b/etl_entities/entity.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 # isort: skip_file diff --git a/etl_entities/hwm/__init__.py b/etl_entities/hwm/__init__.py index 9eef626..8591f5a 100644 --- a/etl_entities/hwm/__init__.py +++ b/etl_entities/hwm/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.hwm.column.column_hwm import ColumnHWM from etl_entities.hwm.column.date_hwm import ColumnDateHWM @@ -6,6 +6,7 @@ from etl_entities.hwm.column.int_hwm import ColumnIntHWM from etl_entities.hwm.file.file_hwm import FileHWM from etl_entities.hwm.file.file_list_hwm import FileListHWM +from etl_entities.hwm.file.file_mtime_hwm import FileModifiedTimeHWM from etl_entities.hwm.hwm import HWM from etl_entities.hwm.hwm_type_registry import HWMTypeRegistry, register_hwm_type from etl_entities.hwm.key_value.key_value_hwm import KeyValueHWM @@ -19,6 +20,7 @@ "ColumnIntHWM", "FileHWM", "FileListHWM", + "FileModifiedTimeHWM", "KeyValueHWM", "KeyValueIntHWM", "HWMTypeRegistry", diff --git a/etl_entities/hwm/column/__init__.py b/etl_entities/hwm/column/__init__.py index 54237d1..62a0496 100644 --- a/etl_entities/hwm/column/__init__.py +++ 
b/etl_entities/hwm/column/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/etl_entities/hwm/column/column_hwm.py b/etl_entities/hwm/column/column_hwm.py index 387565f..6b77d9b 100644 --- a/etl_entities/hwm/column/column_hwm.py +++ b/etl_entities/hwm/column/column_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -164,10 +164,7 @@ def update(self: ColumnHWMType, value: ColumnValueType) -> ColumnHWMType: 2 """ - if self.value is None: - return self.set_value(value) - - if self.value < value: # type: ignore[operator] + if self.value is None or self.value < value: # type: ignore[operator] return self.set_value(value) return self diff --git a/etl_entities/hwm/column/date_hwm.py b/etl_entities/hwm/column/date_hwm.py index b458005..538f1bd 100644 --- a/etl_entities/hwm/column/date_hwm.py +++ b/etl_entities/hwm/column/date_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -18,7 +18,7 @@ @register_hwm_type("column_date") class ColumnDateHWM(ColumnHWM[date]): - """Date HWM type + """HWM based on tracking latest column value of type :obj:`datetime.date`. 
Parameters ---------- diff --git a/etl_entities/hwm/column/datetime_hwm.py b/etl_entities/hwm/column/datetime_hwm.py index d30dc97..5c899aa 100644 --- a/etl_entities/hwm/column/datetime_hwm.py +++ b/etl_entities/hwm/column/datetime_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -18,8 +18,7 @@ @register_hwm_type("column_datetime") class ColumnDateTimeHWM(ColumnHWM[datetime]): - """DateTime HWM type - + """HWM based on tracking latest column value of type :obj:`datetime.datetime`. Parameters ---------- diff --git a/etl_entities/hwm/column/int_hwm.py b/etl_entities/hwm/column/int_hwm.py index bc03de7..304a33c 100644 --- a/etl_entities/hwm/column/int_hwm.py +++ b/etl_entities/hwm/column/int_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -16,7 +16,7 @@ @register_hwm_type("column_int") class ColumnIntHWM(ColumnHWM[int]): - """Integer HWM type + """HWM based on tracking latest column value of type :obj:`int`. 
Parameters ---------- diff --git a/etl_entities/hwm/file/__init__.py b/etl_entities/hwm/file/__init__.py index 54237d1..62a0496 100644 --- a/etl_entities/hwm/file/__init__.py +++ b/etl_entities/hwm/file/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/etl_entities/hwm/file/file_hwm.py b/etl_entities/hwm/file/file_hwm.py index aea0741..01ab704 100644 --- a/etl_entities/hwm/file/file_hwm.py +++ b/etl_entities/hwm/file/file_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm/file/file_list_hwm.py b/etl_entities/hwm/file/file_list_hwm.py index 6fb9ec8..1c5c3c5 100644 --- a/etl_entities/hwm/file/file_list_hwm.py +++ b/etl_entities/hwm/file/file_list_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -21,7 +21,7 @@ @register_hwm_type("file_list") class FileListHWM(FileHWM[FileListType]): - """File List HWM type + """HWM based on tracking list of file names. 
Parameters ---------- @@ -72,10 +72,10 @@ def covers(self, value: str | os.PathLike) -> bool: # type: ignore -------- >>> from etl_entities.hwm import FileListHWM - >>> hwm = FileListHWM(value={"/some/path.py"}, name="my_hwm") - >>> hwm.covers("/some/path.py") # path in HWM + >>> hwm = FileListHWM(value={"/some/old_file.py"}, name="my_hwm") + >>> hwm.covers("/some/old_file.py") # path in HWM True - >>> hwm.covers("/another/path.py") # path not in HWM + >>> hwm.covers("/some/new_file.py") # path not in HWM False """ @@ -90,7 +90,7 @@ def update(self: FileListHWMType, value: str | os.PathLike | Iterable[str | os.P Returns ------- - result : FileHWM + result : FileListHWM Self diff --git a/etl_entities/hwm/file/file_mtime_hwm.py b/etl_entities/hwm/file/file_mtime_hwm.py new file mode 100644 index 0000000..ba595c4 --- /dev/null +++ b/etl_entities/hwm/file/file_mtime_hwm.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime +from typing import Iterable, Optional, TypeVar + +from typing_extensions import Protocol, runtime_checkable + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] + +from etl_entities.hwm import FileHWM +from etl_entities.hwm.hwm_type_registry import register_hwm_type + +FileModifiedTimeHWMType = TypeVar("FileModifiedTimeHWMType", bound="FileModifiedTimeHWM") + + +@runtime_checkable +class StatWithMtime(Protocol): + @property + def st_mtime(self) -> float | None: ... # noqa: E704 + + +@runtime_checkable +class PathWithStats(Protocol): + def is_file(self) -> bool: ... # noqa: E704 + def exists(self) -> bool: ... # noqa: E704 + def stat(self) -> StatWithMtime: ... 
# noqa: E704 + + +@register_hwm_type("file_modification_time") +class FileModifiedTimeHWM(FileHWM[Optional[datetime]]): # noqa: WPS338r + """HWM based on tracking file modification time. + + .. warning:: + + This HWM type is not very precise, as some filesystems may have whole-second precision, + so files created within the same second may be skipped. + + Also this HWM should not be used if file modification time can be changed after the file + was already handled by a previous ETL process run. This could lead to reading the same file twice. + + .. versionadded:: 2.5.0 + + Parameters + ---------- + name : ``str`` + + HWM unique name + + value : :obj:`datetime.datetime` or ``None``, default: ``None`` + + HWM value + + directory : :obj:`pathlib.Path`, default: ``None`` + + Directory for HWM value. + + description : ``str``, default: ``""`` + + Description of HWM + + expression : Any, default: ``None`` + + HWM expression + + modified_time : :obj:`datetime.datetime`, default: current datetime + + HWM value modification time + + Examples + -------- + + .. code:: python + + from datetime import datetime, timezone + from etl_entities.hwm import FileModifiedTimeHWM + + hwm = FileModifiedTimeHWM( + name="hwm_name", + value=datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=timezone.utc), + ) + """ + + value: Optional[datetime] = None + + @validator("value", pre=True) + def _parse_isoformat(cls, value): # noqa: N805 + if isinstance(value, str): + return datetime.fromisoformat(value) + return value + + @validator("value") + def _always_include_tz(cls, value: datetime | None): # noqa: N805r + if value and value.tzinfo is None: + return value.astimezone() + return value + + def covers(self, value: datetime | int | float | PathWithStats) -> bool: # type: ignore + """Return ``True`` if input value is already covered by HWM + + Examples + -------- + + >>> from pathlib import Path + >>> from etl_entities.hwm import FileModifiedTimeHWM + >>> hwm = FileModifiedTimeHWM( + ... 
name="hwm_name", + ... value=datetime(2025, 1, 1, 11, 22, 33, 456789), + ... ) + >>> hwm.covers(Path("/some/old_file.py")) # path not in HWM + False + """ + + new_value: datetime | None + if isinstance(value, PathWithStats): + new_timestamp = value.stat().st_mtime if value.exists() and value.is_file() else None + new_value = self._check_new_value(new_timestamp) + else: + new_value = self._check_new_value(value) + + return self.value is not None and new_value is not None and self.value.timestamp() >= new_value.timestamp() + + def update( + self: FileModifiedTimeHWMType, + value: datetime | int | float | PathWithStats | Iterable[PathWithStats], + ) -> FileModifiedTimeHWMType: + """Updates current HWM value with some implementation-specific logic, and return HWM. + + .. note:: + + Changes HWM value in place + + Returns + ------- + result : FileModifiedTimeHWM + + Self + + Examples + -------- + + >>> from pathlib import Path + >>> from etl_entities.hwm import FileModifiedTimeHWM + >>> hwm = FileModifiedTimeHWM( + ... name='hwm_name', + ... value=datetime(2025, 1, 1, 11, 22, 33, 456789), + ... ) + >>> # old file is already covered + >>> hwm.update(Path("/some/old_file.py")).value + datetime.datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=...) 
+ """ + + new_value: datetime | None = None + if isinstance(value, Iterable): + timestamps = [path.stat().st_mtime for path in value if path.exists() and path.is_file()] + new_timestamp = max(filter(None, timestamps), default=None) + new_value = self._check_new_value(new_timestamp) + elif isinstance(value, PathWithStats): + new_timestamp = value.stat().st_mtime if value.exists() and value.is_file() else None + new_value = self._check_new_value(new_timestamp) + else: + new_value = self._check_new_value(value) + + if not self.value and new_value: + return self.set_value(new_value) + + if self.value and new_value and self.value.timestamp() < new_value.timestamp(): + return self.set_value(new_value) + + return self diff --git a/etl_entities/hwm/hwm.py b/etl_entities/hwm/hwm.py index e233bb5..c54c417 100644 --- a/etl_entities/hwm/hwm.py +++ b/etl_entities/hwm/hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm/hwm_type_registry.py b/etl_entities/hwm/hwm_type_registry.py index 77c5986..e0f01c1 100644 --- a/etl_entities/hwm/hwm_type_registry.py +++ b/etl_entities/hwm/hwm_type_registry.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm/key_value/__init__.py b/etl_entities/hwm/key_value/__init__.py index 54237d1..62a0496 100644 --- a/etl_entities/hwm/key_value/__init__.py +++ b/etl_entities/hwm/key_value/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/etl_entities/hwm/key_value/key_value_hwm.py b/etl_entities/hwm/key_value/key_value_hwm.py index 92e5e4f..c57a176 100644 --- a/etl_entities/hwm/key_value/key_value_hwm.py +++ 
b/etl_entities/hwm/key_value/key_value_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm/key_value/key_value_int_hwm.py b/etl_entities/hwm/key_value/key_value_int_hwm.py index 2786629..9902fac 100644 --- a/etl_entities/hwm/key_value/key_value_int_hwm.py +++ b/etl_entities/hwm/key_value/key_value_int_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm_store/__init__.py b/etl_entities/hwm_store/__init__.py index b55c83f..35651b1 100644 --- a/etl_entities/hwm_store/__init__.py +++ b/etl_entities/hwm_store/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.hwm_store.base_hwm_store import BaseHWMStore from etl_entities.hwm_store.hwm_store_class_registry import ( diff --git a/etl_entities/hwm_store/base_hwm_store.py b/etl_entities/hwm_store/base_hwm_store.py index 0dfb198..2138e2c 100644 --- a/etl_entities/hwm_store/base_hwm_store.py +++ b/etl_entities/hwm_store/base_hwm_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm_store/hwm_store_class_registry.py b/etl_entities/hwm_store/hwm_store_class_registry.py index 80da7c2..71a7e89 100644 --- a/etl_entities/hwm_store/hwm_store_class_registry.py +++ b/etl_entities/hwm_store/hwm_store_class_registry.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git 
a/etl_entities/hwm_store/hwm_store_detect.py b/etl_entities/hwm_store/hwm_store_detect.py index 3d61aca..533269c 100644 --- a/etl_entities/hwm_store/hwm_store_detect.py +++ b/etl_entities/hwm_store/hwm_store_detect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm_store/hwm_store_stack_manager.py b/etl_entities/hwm_store/hwm_store_stack_manager.py index 2c41929..b553115 100644 --- a/etl_entities/hwm_store/hwm_store_stack_manager.py +++ b/etl_entities/hwm_store/hwm_store_stack_manager.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/hwm_store/memory_hwm_store.py b/etl_entities/hwm_store/memory_hwm_store.py index df7d354..1357292 100644 --- a/etl_entities/hwm_store/memory_hwm_store.py +++ b/etl_entities/hwm_store/memory_hwm_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/__init__.py b/etl_entities/old_hwm/__init__.py index f7cbeba..3770e62 100644 --- a/etl_entities/old_hwm/__init__.py +++ b/etl_entities/old_hwm/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.old_hwm.column_hwm import ColumnHWM from etl_entities.old_hwm.date_hwm import DateHWM diff --git a/etl_entities/old_hwm/column_hwm.py b/etl_entities/old_hwm/column_hwm.py index e943bdb..94fd13f 100644 --- a/etl_entities/old_hwm/column_hwm.py +++ b/etl_entities/old_hwm/column_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # 
SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/date_hwm.py b/etl_entities/old_hwm/date_hwm.py index bc0352f..a228def 100644 --- a/etl_entities/old_hwm/date_hwm.py +++ b/etl_entities/old_hwm/date_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/datetime_hwm.py b/etl_entities/old_hwm/datetime_hwm.py index 22ec933..4047c15 100644 --- a/etl_entities/old_hwm/datetime_hwm.py +++ b/etl_entities/old_hwm/datetime_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/file_hwm.py b/etl_entities/old_hwm/file_hwm.py index db3adba..4f01432 100644 --- a/etl_entities/old_hwm/file_hwm.py +++ b/etl_entities/old_hwm/file_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/file_list_hwm.py b/etl_entities/old_hwm/file_list_hwm.py index b0a057e..1dfed28 100644 --- a/etl_entities/old_hwm/file_list_hwm.py +++ b/etl_entities/old_hwm/file_list_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/hwm.py b/etl_entities/old_hwm/hwm.py index 4b26c97..d4905af 100644 --- a/etl_entities/old_hwm/hwm.py +++ b/etl_entities/old_hwm/hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/old_hwm/int_hwm.py 
b/etl_entities/old_hwm/int_hwm.py index e2618c4..619a049 100644 --- a/etl_entities/old_hwm/int_hwm.py +++ b/etl_entities/old_hwm/int_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/plugins/__init__.py b/etl_entities/plugins/__init__.py index 1ec8e92..cf18f38 100644 --- a/etl_entities/plugins/__init__.py +++ b/etl_entities/plugins/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.plugins.import_plugins import import_plugins diff --git a/etl_entities/plugins/import_plugins.py b/etl_entities/plugins/import_plugins.py index 752e0bb..283620e 100644 --- a/etl_entities/plugins/import_plugins.py +++ b/etl_entities/plugins/import_plugins.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/process/__init__.py b/etl_entities/process/__init__.py index 1abb39c..3d2498e 100644 --- a/etl_entities/process/__init__.py +++ b/etl_entities/process/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.process.process import Process from etl_entities.process.process_stack_manager import ProcessStackManager diff --git a/etl_entities/process/process.py b/etl_entities/process/process.py index fb6a9fd..7a9aa2c 100644 --- a/etl_entities/process/process.py +++ b/etl_entities/process/process.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/process/process_stack_manager.py 
b/etl_entities/process/process_stack_manager.py index 49a5edc..c1a868e 100644 --- a/etl_entities/process/process_stack_manager.py +++ b/etl_entities/process/process_stack_manager.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/source/__init__.py b/etl_entities/source/__init__.py index a575a62..924c9b7 100644 --- a/etl_entities/source/__init__.py +++ b/etl_entities/source/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.source.db import Column, Table from etl_entities.source.file import RemoteFolder diff --git a/etl_entities/source/db/__init__.py b/etl_entities/source/db/__init__.py index 078d80c..eaed470 100644 --- a/etl_entities/source/db/__init__.py +++ b/etl_entities/source/db/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.source.db.column import Column from etl_entities.source.db.table import Table diff --git a/etl_entities/source/db/column.py b/etl_entities/source/db/column.py index 87d02fb..2bab3eb 100644 --- a/etl_entities/source/db/column.py +++ b/etl_entities/source/db/column.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/source/db/table.py b/etl_entities/source/db/table.py index cb2849f..62e0827 100644 --- a/etl_entities/source/db/table.py +++ b/etl_entities/source/db/table.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git 
a/etl_entities/source/file/__init__.py b/etl_entities/source/file/__init__.py index 0657487..a40b702 100644 --- a/etl_entities/source/file/__init__.py +++ b/etl_entities/source/file/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from etl_entities.source.file.remote_folder import RemoteFolder diff --git a/etl_entities/source/file/remote_folder.py b/etl_entities/source/file/remote_folder.py index 0d5d77f..06ca360 100644 --- a/etl_entities/source/file/remote_folder.py +++ b/etl_entities/source/file/remote_folder.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/etl_entities/version.py b/etl_entities/version.py index 1676e33..ebb224d 100644 --- a/etl_entities/version.py +++ b/etl_entities/version.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS PJSC +# SPDX-FileCopyrightText: 2021-2025 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from pathlib import Path diff --git a/setup.cfg b/setup.cfg index 67d45fa..28a4cb8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -196,7 +196,9 @@ ignore = # WPS410 Found wrong metadata variable: __all__ WPS410, # WPS474 Found import object collision - WPS474 + WPS474, +# WPS412 Found `__init__.py` module with logic + WPS412 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = diff --git a/setup.py b/setup.py index f49ad74..ce93908 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ def parse_requirements(file: Path) -> list[str]: url="https://github.com/MobileTeleSystems/etl-entities", packages=find_packages(exclude=["docs", "docs.*", "tests", "tests.*"]), classifiers=[ - "Development Status :: 3 - Alpha", + "Development Status :: 5 - Production/Stable", "Framework :: Pydantic", "Framework :: 
Pydantic :: 1", "Framework :: Pydantic :: 2", @@ -59,6 +59,7 @@ def parse_requirements(file: Path) -> list[str]: "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Typing :: Typed", ], project_urls={ diff --git a/tests/test_hwm/test_file_mtime_hwm.py b/tests/test_hwm/test_file_mtime_hwm.py new file mode 100644 index 0000000..054c47f --- /dev/null +++ b/tests/test_hwm/test_file_mtime_hwm.py @@ -0,0 +1,516 @@ +from __future__ import annotations + +import os +import secrets +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from etl_entities.hwm import FileModifiedTimeHWM +from etl_entities.instance import AbsolutePath + + +@pytest.mark.parametrize( + "input_value, expected_value", + [ + # values are always timezone-aware + ( + datetime(2023, 12, 30, 11, 22, 33, 456789), + datetime(2023, 12, 30, 11, 22, 33, 456789).astimezone(), + ), + ( + "2023-12-30T11:22:33.456789", + datetime(2023, 12, 30, 11, 22, 33, 456789).astimezone(), + ), + ( + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone.utc), + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone.utc), + ), + ( + "2023-12-30T11:22:33.456789+00:00", + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone.utc), + ), + ( + 1703935353.456789, # timestampts are always UTC + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone.utc), + ), + ( + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone(timedelta(hours=3))), + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone(timedelta(hours=3))), + ), + ( + "2023-12-30T11:22:33.456789+03:00", + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone(timedelta(hours=3))), + ), + ], +) +def test_file_modified_time_hwm_valid_input(input_value, expected_value): + name = "file_mtime" + modified_time = datetime.now() - 
timedelta(days=5) + + empty_hwm = FileModifiedTimeHWM(name=name) + assert empty_hwm.name == name + assert empty_hwm.value is None + + minimal_hwm = FileModifiedTimeHWM(name=name, value=input_value) + assert minimal_hwm.name == name + assert minimal_hwm.value == expected_value + + hwm = FileModifiedTimeHWM( + name=name, + value=input_value, + description="my hwm", + directory="/absolute", + expression="something", + modified_time=modified_time, + ) + assert hwm.name == name + assert hwm.value == expected_value + assert hwm.description == "my hwm" + assert hwm.entity == AbsolutePath("/absolute") + assert hwm.expression == "something" + assert hwm.modified_time == modified_time + + +def test_file_modified_time_hwm_wrong_input(): + with pytest.raises(ValueError): + # missing name + FileModifiedTimeHWM() + + with pytest.raises(ValueError): + # missing name + FileModifiedTimeHWM(value=datetime(2025, 1, 1)) + + with pytest.raises(ValueError): + # cannot parse + FileModifiedTimeHWM(name="file_mtime", value="wtf") + + with pytest.raises(ValueError): + # extra fields not allowed + FileModifiedTimeHWM(name="file_mtime", unknown="unknown") + + +@pytest.mark.parametrize( + "input_value, expected_value", + [ + ( + datetime(2023, 12, 30, 11, 22, 33, 456789), + datetime(2023, 12, 30, 11, 22, 33, 456789).astimezone(), + ), + ( + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone.utc), + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone.utc), + ), + ( + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone(timedelta(hours=3))), + datetime(2023, 12, 30, 11, 22, 33, 456789, tzinfo=timezone(timedelta(hours=3))), + ), + ], +) +def test_file_modified_time_hwm_set_value(input_value, expected_value): + hwm = FileModifiedTimeHWM(name="file_mtime") + + hwm1 = hwm.copy() + hwm1.set_value(input_value) + assert hwm1.value == expected_value + assert hwm1.modified_time > hwm.modified_time + + hwm2 = hwm.copy() + hwm2.set_value(input_value + timedelta(seconds=1)) + assert 
hwm2.value == expected_value + timedelta(seconds=1) + assert hwm2.modified_time > hwm1.modified_time + + hwm3 = hwm2.copy() + hwm3.set_value(input_value - timedelta(seconds=1)) + assert hwm3.value == expected_value - timedelta(seconds=1) + assert hwm3.modified_time > hwm2.modified_time + + +def test_file_modified_time_hwm_frozen(): + hwm = FileModifiedTimeHWM(name="file_mtime") + + for attr in ("value", "entity", "expression", "description", "modified_time"): + for item in (1, "abc", None, datetime.now()): + with pytest.raises(TypeError): + setattr(hwm, attr, item) + + +def test_file_modified_time_hwm_compare(): + name1 = secrets.token_hex() + name2 = secrets.token_hex() + + value1 = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=timezone.utc) + value2 = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=timezone(timedelta(hours=1))) + + folder1 = AbsolutePath("/some/path") + folder2 = AbsolutePath("/another/path") + + hwm1 = FileModifiedTimeHWM(name=name1, value=value1) + hwm2 = FileModifiedTimeHWM(name=name2, value=value1) + hwm3 = FileModifiedTimeHWM(name=name1, value=value2) + hwm4 = FileModifiedTimeHWM(name=name2, value=value2) + + hwm5 = FileModifiedTimeHWM(name=name1, directory=folder1) + hwm6 = FileModifiedTimeHWM(name=name1, directory=folder2) + + hwm7 = FileModifiedTimeHWM(name=name1, description="abc") + hwm8 = FileModifiedTimeHWM(name=name1, description="bcd") + + hwm9 = FileModifiedTimeHWM(name=name1, expression="abc") + hwm10 = FileModifiedTimeHWM(name=name1, expression="bcd") + + modified_time = datetime.now() - timedelta(days=5) + hwm_with_different_mtime = FileModifiedTimeHWM(name=name1, value=value1, modified_time=modified_time) + + # modified time is ignored when comparing + assert hwm1 == hwm_with_different_mtime + + items = (hwm1, hwm2, hwm3, hwm4, hwm5, hwm6, hwm7, hwm8, hwm9, hwm10) + + # items with different attribute values (except modified_time) are not equal + for item1 in items: + for item2 in items: + if item1 is not item2: + assert item1 
!= item2 + + +@pytest.mark.parametrize( + "tzinfo", + [ + None, + timezone.utc, + timezone(timedelta(hours=3)), + ], +) +def test_file_modified_time_hwm_covers(tzinfo): + value = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=tzinfo) + new_value = value + timedelta(seconds=1) + + empty_hwm = FileModifiedTimeHWM(name="file_mtime") + assert not empty_hwm.covers(value) + assert not empty_hwm.covers(new_value) + + hwm = FileModifiedTimeHWM(name="file_mtime", value=value) + assert hwm.covers(value) + assert not hwm.covers(new_value) + + assert hwm.covers(value.timestamp()) + assert not hwm.covers(new_value.timestamp()) + + old_file = Mock(spec=Path) + old_file_stat = Mock(spec=os.stat_result) + old_file_stat.st_mtime = value.timestamp() + old_file.stat.return_value = old_file_stat + old_file.exists.return_value = True + old_file.is_file.return_value = True + + new_file = Mock(spec=Path) + new_file_stat = Mock(spec=os.stat_result) + new_file_stat.st_mtime = new_value.timestamp() + new_file.stat.return_value = new_file_stat + new_file.exists.return_value = True + new_file.is_file.return_value = True + + missing_path = Mock(spec=Path) + missing_path.exists.return_value = False + + directory_path = Mock(spec=Path) + directory_path.exists.return_value = True + directory_path.is_file.return_value = False + + assert hwm.covers(old_file) + assert not hwm.covers(new_file) + assert not hwm.covers(missing_path) + assert not hwm.covers(directory_path) + + +@pytest.mark.parametrize( + "value", + [ + datetime.now().astimezone(), + None, + ], +) +def test_file_modified_time_hwm_update_none(value): + initial_hwm = FileModifiedTimeHWM(name="file_mtime", value=value) + + # empty value -> do nothing + hwm = initial_hwm.copy() + updated_hwm = hwm.update(None) + updated_hwm = updated_hwm.update([]) + updated_hwm = updated_hwm.update({}) + + assert updated_hwm.value == value # unchanged + assert updated_hwm == initial_hwm + assert updated_hwm is hwm # same obj is returned + assert 
updated_hwm.modified_time == initial_hwm.modified_time + + +@pytest.mark.parametrize( + "tzinfo", + [ + None, + timezone.utc, + timezone(timedelta(hours=3)), + ], +) +def test_file_modified_time_hwm_update_datetime(tzinfo: timezone | None): + empty_hwm = FileModifiedTimeHWM(name="file_mtime") + + value = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=tzinfo) + new_value = value + timedelta(seconds=1) + + hwm = empty_hwm.copy() + updated_hwm = hwm.update(value) + assert updated_hwm.value == value.astimezone() + assert updated_hwm is hwm # modified in-place + assert hwm.modified_time > empty_hwm.modified_time + latest_modified_time = hwm.modified_time + + updated_hwm = hwm.update(new_value) + assert updated_hwm.value == new_value.astimezone() + assert updated_hwm is hwm # modified in-place + assert updated_hwm.modified_time > latest_modified_time + latest_modified_time = updated_hwm.modified_time + + updated_hwm = hwm.update(value) # previous value + assert updated_hwm.value == new_value.astimezone() # unchanged + assert updated_hwm is hwm + assert updated_hwm.modified_time == latest_modified_time + + +@pytest.mark.parametrize( + "tzinfo", + [ + None, + timezone.utc, + timezone(timedelta(hours=3)), + ], +) +def test_file_modified_time_hwm_update_timestamp(tzinfo: timezone | None): + empty_hwm = FileModifiedTimeHWM(name="file_mtime") + + value = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=tzinfo) + + hwm = empty_hwm.copy() + updated_hwm = hwm.update(value.timestamp()) + assert updated_hwm.value == value.astimezone() + assert updated_hwm is hwm # modified in-place + assert hwm.modified_time > empty_hwm.modified_time + latest_modified_time = hwm.modified_time + + updated_hwm = hwm.update(value.timestamp() + 1) + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) + assert updated_hwm is hwm # modified in-place + assert updated_hwm.modified_time > latest_modified_time + latest_modified_time = updated_hwm.modified_time + + updated_hwm = 
hwm.update(value.timestamp()) # previous value + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) # unchanged + assert updated_hwm is hwm + assert updated_hwm.modified_time == latest_modified_time + + +@pytest.mark.parametrize( + "tzinfo", + [ + None, + timezone.utc, + timezone(timedelta(hours=3)), + ], +) +def test_file_modified_time_hwm_update_filepath(tzinfo: timezone | None): + empty_hwm = FileModifiedTimeHWM(name="file_mtime") + + value = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=tzinfo) + + old_file = Mock(spec=Path) + old_file_stat = Mock(spec=os.stat_result) + old_file_stat.st_mtime = value.timestamp() + old_file.stat.return_value = old_file_stat + old_file.exists.return_value = True + old_file.is_file.return_value = True + + hwm = empty_hwm.copy() + updated_hwm = hwm.update(old_file) + assert updated_hwm.value == value.astimezone() + assert updated_hwm is hwm # modified in-place + assert hwm.modified_time > empty_hwm.modified_time + latest_modified_time = hwm.modified_time + + new_file = Mock(spec=Path) + new_file_stat = Mock(spec=os.stat_result) + new_file_stat.st_mtime = value.timestamp() + 1 + new_file.stat.return_value = new_file_stat + new_file.exists.return_value = True + new_file.is_file.return_value = True + + updated_hwm = hwm.update(new_file) + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) + assert updated_hwm is hwm # modified in-place + assert updated_hwm.modified_time > latest_modified_time + latest_modified_time = updated_hwm.modified_time + + updated_hwm = hwm.update(old_file) # previous value + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) # unchanged + assert updated_hwm is hwm + assert updated_hwm.modified_time == latest_modified_time + + missing_path = Mock(spec=Path) + missing_path.exists.return_value = False + + updated_hwm = hwm.update(missing_path) + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) + assert updated_hwm is hwm + assert 
updated_hwm.modified_time == latest_modified_time + + directory_path = Mock(spec=Path) + directory_path.is_file.return_value = False + + updated_hwm = hwm.update(directory_path) + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) + assert updated_hwm is hwm + assert updated_hwm.modified_time == latest_modified_time + + +@pytest.mark.parametrize( + "tzinfo", + [ + None, + timezone.utc, + timezone(timedelta(hours=3)), + ], +) +def test_file_modified_time_hwm_update_filepath_iterable(tzinfo: timezone | None): + empty_hwm = FileModifiedTimeHWM(name="file_mtime") + + value = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=tzinfo) + + old_file1 = Mock(spec=Path) + old_file1_stat = Mock(spec=os.stat_result) + old_file1_stat.st_mtime = value.timestamp() - 10 + old_file1.stat.return_value = old_file1_stat + old_file1.exists.return_value = True + old_file1.is_file.return_value = True + + old_file2 = Mock(spec=Path) + old_file2_stat = Mock(spec=os.stat_result) + old_file2_stat.st_mtime = value.timestamp() + old_file2.stat.return_value = old_file2_stat + old_file2.exists.return_value = True + old_file2.is_file.return_value = True + + hwm = empty_hwm.copy() + updated_hwm = hwm.update([old_file1, old_file2]) + assert updated_hwm.value == value.astimezone() + assert updated_hwm is hwm # modified in-place + assert hwm.modified_time > empty_hwm.modified_time + latest_modified_time = hwm.modified_time + + new_file = Mock(spec=Path) + new_file_stat = Mock(spec=os.stat_result) + new_file_stat.st_mtime = value.timestamp() + 1 + new_file.stat.return_value = new_file_stat + new_file.exists.return_value = True + new_file.is_file.return_value = True + + updated_hwm = hwm.update({new_file}) + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) + assert updated_hwm is hwm # modified in-place + assert updated_hwm.modified_time > latest_modified_time + latest_modified_time = updated_hwm.modified_time + + updated_hwm = hwm.update([old_file1, new_file]) # 
previous value + assert updated_hwm.value == value.astimezone() + timedelta(seconds=1) # unchanged + assert updated_hwm is hwm + assert updated_hwm.modified_time == latest_modified_time + + +def test_file_modified_time_hwm_update_filepath_real(tmp_path: Path): + non_existing = tmp_path / "missing" + + some_file = tmp_path / "new_file" + some_file.touch() + + hwm = FileModifiedTimeHWM(name="file_mtime") + assert not hwm.covers(some_file) + assert not hwm.covers(tmp_path) + assert not hwm.covers(non_existing) + latest_modified_time = hwm.modified_time + + hwm.update(some_file) + assert hwm.covers(some_file) + assert not hwm.covers(tmp_path) + assert not hwm.covers(non_existing) + assert hwm.modified_time > latest_modified_time + latest_modified_time = hwm.modified_time + + time.sleep(0.1) + + new_file = tmp_path / "new_file2" + new_file.touch() + + assert not hwm.covers(new_file) + assert not hwm.covers(tmp_path) + assert not hwm.covers(non_existing) + + hwm.update([some_file, some_file.parent, non_existing, new_file]) + assert hwm.covers(some_file) + assert hwm.covers(new_file) + assert not hwm.covers(tmp_path) + assert not hwm.covers(non_existing) + assert hwm.modified_time > latest_modified_time + latest_modified_time = hwm.modified_time + + hwm.update(some_file) + assert hwm.covers(some_file) + assert hwm.covers(new_file) + assert not hwm.covers(tmp_path) + assert not hwm.covers(non_existing) + assert hwm.modified_time == latest_modified_time + + +def test_file_modified_time_hwm_serialization(): + modified_time = datetime.now() - timedelta(days=5) + + value = datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=timezone.utc) + hwm = FileModifiedTimeHWM( + name="file_mtime", + value=value, + directory="/some/path", + expression="some", + description="some description", + modified_time=modified_time, + ) + + expected = { + "type": "file_modification_time", + "name": "file_mtime", + "value": "2025-01-01T11:22:33.456789+00:00", + "entity": "/some/path", + "expression": 
"some", + "description": "some description", + "modified_time": modified_time.isoformat(), + } + + serialized = hwm.serialize() + assert expected == serialized + assert FileModifiedTimeHWM.deserialize(serialized) == hwm + + empty_hwm = FileModifiedTimeHWM(name="file_mtime", modified_time=modified_time) + empty_hwm_expected = { + "type": "file_modification_time", + "name": "file_mtime", + "value": None, + "entity": None, + "expression": None, + "description": "", + "modified_time": modified_time.isoformat(), + } + + empty_hwm_serialized = empty_hwm.serialize() + assert empty_hwm_serialized == empty_hwm_expected + assert FileModifiedTimeHWM.deserialize(empty_hwm_serialized) == empty_hwm diff --git a/tests/test_hwm_store/test_hwm_store_get_set.py b/tests/test_hwm_store/test_hwm_store_get_set.py index 4817a34..dc6647f 100644 --- a/tests/test_hwm_store/test_hwm_store_get_set.py +++ b/tests/test_hwm_store/test_hwm_store_get_set.py @@ -1,12 +1,31 @@ +import os import secrets -from datetime import date, datetime, timedelta +from datetime import date, datetime, timezone +from pathlib import Path +from unittest.mock import Mock import pytest -from etl_entities.hwm import ColumnDateHWM, ColumnDateTimeHWM, ColumnIntHWM, FileListHWM +from etl_entities.hwm import ( + ColumnDateHWM, + ColumnDateTimeHWM, + ColumnIntHWM, + FileListHWM, + FileModifiedTimeHWM, +) from etl_entities.hwm_store import MemoryHWMStore +def file_with_mtime(mtime: datetime) -> Path: + result = Mock(spec=Path) + result.exists.return_value = True + result.is_file.return_value = True + result_stat = Mock(spec=os.stat_result) + result_stat.st_mtime = mtime.timestamp() + result.stat.return_value = result_stat + return result + + @pytest.fixture( params=[ ( @@ -29,17 +48,17 @@ ColumnDateHWM( name=secrets.token_hex(5), source=secrets.token_hex(5), - value=date(year=2023, month=8, day=15), + value=date(2025, 1, 1), ), - timedelta(days=31), + date(2025, 1, 2), ), ( ColumnDateTimeHWM( name=secrets.token_hex(5), 
source=secrets.token_hex(5), - value=datetime(year=2023, month=8, day=15, hour=11, minute=22, second=33), + value=datetime(2025, 1, 1, 11, 22, 33, 456789), ), - timedelta(seconds=50), + datetime(2025, 1, 1, 22, 33, 44, 567890), ), ( FileListHWM( @@ -57,6 +76,14 @@ ), "/absolute/path/file3", ), + ( + FileModifiedTimeHWM( + name=secrets.token_hex(5), + directory="/absolute/path", + value=datetime(2025, 1, 1, 11, 22, 33, 456789, tzinfo=timezone.utc), + ), + file_with_mtime(datetime(2025, 1, 1, 22, 33, 44, 567890, tzinfo=timezone.utc)), + ), ], ) def hwm_delta(request): @@ -72,7 +99,7 @@ def test_hwm_store_get_save(hwm_delta): assert hwm_store.get_hwm(hwm.name) == hwm # changing HWM object does not change MemoryHWMStore data - hwm1 = hwm.copy() + delta + hwm1 = hwm.copy().update(delta) assert hwm_store.get_hwm(hwm.name) == hwm # it is changed only after explicit call of .set_hwm()