diff --git a/.github/workflows/dependency_checker.yml b/.github/workflows/dependency_checker.yml new file mode 100644 index 0000000..7dbe870 --- /dev/null +++ b/.github/workflows/dependency_checker.yml @@ -0,0 +1,29 @@ +name: Dependency Checker +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 1-5' +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install .[dev] + make check-deps OUTPUT_FILEPATH=latest_requirements.txt + - name: Create pull request + id: cpr + uses: peter-evans/create-pull-request@v4 + with: + token: ${{ secrets.GH_ACCESS_TOKEN }} + commit-message: Update latest dependencies + title: Automated Latest Dependency Updates + body: "This is an auto-generated PR with **latest** dependency updates." + branch: latest-dependency-update + branch-suffix: short-commit-hash + base: main diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 5bf8e60..23e69bc 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml index e56b5a5..832e7f0 100644 --- a/.github/workflows/minimum.yml +++ b/.github/workflows/minimum.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml index b3a4413..0967aa8 100644 --- a/.github/workflows/readme.yml +++ b/.github/workflows/readme.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest, macos-latest] # skip windows bc rundoc fails steps: - uses: actions/checkout@v1 @@ -22,5 +22,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install invoke rundoc . + python -m pip install tomli + python -m pip install packaging - name: Run the README.md run: invoke readme diff --git a/.github/workflows/static_code_analysis.yml b/.github/workflows/static_code_analysis.yml new file mode 100644 index 0000000..72bc07f --- /dev/null +++ b/.github/workflows/static_code_analysis.yml @@ -0,0 +1,33 @@ +name: Static Code Analysis + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + code-analysis: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install bandit==1.7.7 + - name: Save code analysis + run: bandit -r . -x ./tests -f txt -o static_code_analysis.txt --exit-zero + - name: Create pull request + id: cpr + uses: peter-evans/create-pull-request@v4 + with: + token: ${{ secrets.GH_ACCESS_TOKEN }} + commit-message: Update static code analysis + title: Latest Code Analysis + body: "This is an auto-generated PR with the **latest** code analysis results." 
+ branch: static-code-analysis + branch-suffix: short-commit-hash + base: main diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml index 4805cc3..ef48d6e 100644 --- a/.github/workflows/unit.yml +++ b/.github/workflows/unit.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v1 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 62d1a08..08a98bb 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -174,17 +174,17 @@ Release Workflow The process of releasing a new version involves several steps combining both ``git`` and ``bumpversion`` which, briefly: -1. Merge what is in ``master`` branch into ``stable`` branch. +1. Merge what is in ``main`` branch into ``stable`` branch. 2. Update the version in ``setup.cfg``, ``deepecho/__init__.py`` and ``HISTORY.md`` files. 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch. -4. Merge the new commit from ``stable`` into ``master``. +4. Merge the new commit from ``stable`` into ``main``. 5. Update the version in ``setup.cfg`` and ``deepecho/__init__.py`` to open the next development iteration. .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new entry that explains the changes that will be included in the new version. - Normally this is just a list of the Pull Requests that have been merged to master + Normally this is just a list of the Pull Requests that have been merged to main since the last release. Once this is done, run of the following commands: @@ -219,9 +219,9 @@ This will perform the following actions: 2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)`` After this is done, the new pre-release can be installed by including the ``dev`` section in the -dependency specification, either in ``setup.py``:: +dependency specification, either in ``pyproject.toml``:: - install_requires = [ + dependencies = [ ... 'deepecho>=X.Y.Z.dev', ... diff --git a/HISTORY.md b/HISTORY.md index 9711fc5..5293bbb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,30 @@ # History +## 0.6.0 - 2024-04-10 + +This release adds support for Python 3.12! + +### Maintenance + +* Support Python 3.12 - Issue [#85](https://github.com/sdv-dev/DeepEcho/issues/85) by @fealho +* Transition from using setup.py to pyproject.toml to specify project metadata - Issue [#86](https://github.com/sdv-dev/DeepEcho/issues/86) by @R-Palazzo +* Remove bumpversion and use bump-my-version - Issue [#87](https://github.com/sdv-dev/DeepEcho/issues/87) by @R-Palazzo +* Add dependency checker - Issue [#96](https://github.com/sdv-dev/DeepEcho/issues/96) by @lajohn4747 +* Add bandit workflow - Issue [#98](https://github.com/sdv-dev/DeepEcho/issues/98) by @R-Palazzo + +### Bugs Fixed + +* Fix make check candidate - Issue [#91](https://github.com/sdv-dev/DeepEcho/issues/91) by @R-Palazzo +* Fix minimum version workflow when pointing to github branch - Issue [#99](https://github.com/sdv-dev/DeepEcho/issues/99) by @R-Palazzo + +## 0.5.0 - 2023-11-13 + +This release updates the PAR's model progress bar to show loss values and time elapsed (verbose option). 
+ +### New Features +* Update progress bar for PAR fitting - Issue [#80](https://github.com/sdv-dev/DeepEcho/issues/80) by @frances-h + + ## 0.4.2 - 2023-07-25 This release drops support for Python 3.7 and adds support for Python 3.11. diff --git a/INSTALL.md b/INSTALL.md index 132da72..3d7ee65 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -3,7 +3,7 @@ ## Requirements **DeepEcho** has been developed and tested on -[Python 3.8, 3.9, 3.10 and 3.11](https://www.python.org/downloads/) +[Python 3.8, 3.9, 3.10, 3.11 and 3.12](https://www.python.org/downloads/) Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid @@ -47,12 +47,12 @@ make install If you intend to modify the source code or contribute to the project you will need to install it from the source using the `make install-develop` command. In this case, we -recommend you to branch from `master` first: +recommend you to branch from `main` first: ```bash git clone git@github.com:sdv-dev/DeepEcho cd DeepEcho -git checkout master +git checkout main git checkout -b <branch-name> make install-develop ``` diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2978997..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,12 +0,0 @@ -include AUTHORS.rst -include CONTRIBUTING.rst -include HISTORY.md -include LICENSE -include README.md -include deepecho/data/demo.csv - -recursive-include tests * -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] - -recursive-include docs *.md *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/Makefile b/Makefile index d8b28c3..4bfef15 100644 --- a/Makefile +++ b/Makefile @@ -79,30 +79,17 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a # LINT TARGETS -.PHONY: lint-deepecho -lint-deepecho: ## check style with flake8 and isort - flake8 deepecho - isort -c --recursive deepecho - pylint deepecho --rcfile=setup.cfg - -.PHONY: lint-tests -lint-tests: ## check style with flake8 and isort - flake8 --ignore=D tests - isort -c --recursive tests - .PHONY: lint -lint: ## Run all code style checks - invoke lint +lint: + ruff check . + ruff format . --check .PHONY: fix-lint -fix-lint: ## fix lint issues using autoflake, autopep8, and isort - find deepecho tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive deepecho tests - isort --apply --atomic --recursive deepecho tests - +fix-lint: + ruff check --fix . + ruff format .
# TEST TARGETS - .PHONY: test-unit test-unit: ## run unit tests using pytest invoke unit @@ -145,8 +132,7 @@ coverage: ## check code coverage quickly with the default Python .PHONY: dist dist: clean ## builds source and wheel package - python setup.py sdist - python setup.py bdist_wheel + python -m build --wheel --sdist ls -l dist .PHONY: publish-confirm @@ -165,46 +151,46 @@ publish: dist publish-confirm ## package and upload a release twine upload dist/* .PHONY: bumpversion-release -bumpversion-release: ## Merge master to stable and bumpversion release +bumpversion-release: ## Merge main to stable and bumpversion release git checkout stable || git checkout -b stable - git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" - bumpversion release + git merge --no-ff main -m"make release-tag: Merge branch 'main' into stable" + bump-my-version bump release git push --tags origin stable .PHONY: bumpversion-release-test -bumpversion-release-test: ## Merge master to stable and bumpversion release +bumpversion-release-test: ## Merge main to stable and bumpversion release git checkout stable || git checkout -b stable - git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" - bumpversion release --no-tag + git merge --no-ff main -m"make release-tag: Merge branch 'main' into stable" + bump-my-version bump release --no-tag @echo git push --tags origin stable .PHONY: bumpversion-patch -bumpversion-patch: ## Merge stable to master and bumpversion patch - git checkout master +bumpversion-patch: ## Merge stable to main and bumpversion patch + git checkout main git merge stable - bumpversion --no-tag patch + bump-my-version bump patch --no-tag git push .PHONY: bumpversion-candidate bumpversion-candidate: ## Bump the version to the next candidate - bumpversion candidate --no-tag + bump-my-version bump candidate --no-tag .PHONY: bumpversion-minor bumpversion-minor: ## Bump the version the next minor skipping the release - bumpversion --no-tag minor + bump-my-version bump minor --no-tag .PHONY: bumpversion-major bumpversion-major: ## Bump the version the next major skipping the release - bumpversion --no-tag major + bump-my-version bump major --no-tag .PHONY: bumpversion-revert bumpversion-revert: ## Undo a previous bumpversion-release - git checkout master + git checkout main git branch -D stable CLEAN_DIR := $(shell git status --short | grep -v ??) 
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) -CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*") +CURRENT_VERSION := $(shell grep "^current_version" pyproject.toml | grep -o "dev[0-9]*") CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) .PHONY: check-clean @@ -213,10 +199,10 @@ ifneq ($(CLEAN_DIR),) $(error There are uncommitted changes) endif -.PHONY: check-master -check-master: ## Check if we are in master branch -ifneq ($(CURRENT_BRANCH),master) - $(error Please make the release from master branch\n) +.PHONY: check-main +check-main: ## Check if we are in main branch +ifneq ($(CURRENT_BRANCH),main) + $(error Please make the release from main branch\n) endif .PHONY: check-candidate @@ -231,8 +217,13 @@ ifeq ($(CHANGELOG_LINES),0) $(error Please insert the release notes in HISTORY.md before releasing) endif +.PHONY: check-deps +check-deps: # Dependency targets + $(eval allow_list='numpy=|pandas=|torch=|tqdm=') + pip freeze | grep -v "SDMetrics.git" | grep -E $(allow_list) | sort > $(OUTPUT_FILEPATH) + .PHONY: check-release -check-release: check-clean check-candidate check-master check-history ## Check if the release can be made +check-release: check-clean check-candidate check-main check-history ## Check if the release can be made @echo "A new release can be made" .PHONY: release @@ -242,7 +233,7 @@ release: check-release bumpversion-release publish bumpversion-patch release-test: check-release bumpversion-release-test publish-test bumpversion-revert .PHONY: release-candidate -release-candidate: check-master publish bumpversion-candidate +release-candidate: check-main publish bumpversion-candidate .PHONY: release-candidate-test -release-candidate-test: check-clean check-master publish-test +release-candidate-test: check-clean check-main publish-test diff --git a/README.md b/README.md index 70960eb..d145073 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,17 @@ [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi Shield](https://img.shields.io/pypi/v/deepecho.svg)](https://pypi.python.org/pypi/deepecho) -[![Tests](https://github.com/sdv-dev/DeepEcho/workflows/Run%20Tests/badge.svg)](https://github.com/sdv-dev/DeepEcho/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) +[![Tests](https://github.com/sdv-dev/DeepEcho/workflows/Run%20Tests/badge.svg)](https://github.com/sdv-dev/DeepEcho/actions?query=workflow%3A%22Run+Tests%22+branch%3Amain) [![Downloads](https://pepy.tech/badge/deepecho)](https://pepy.tech/project/deepecho) -[![Coverage Status](https://codecov.io/gh/sdv-dev/DeepEcho/branch/master/graph/badge.svg)](https://codecov.io/gh/sdv-dev/DeepEcho) -[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sdv-dev/DeepEcho/master?filepath=tutorials/timeseries_data) +[![Coverage Status](https://codecov.io/gh/sdv-dev/DeepEcho/branch/main/graph/badge.svg)](https://codecov.io/gh/sdv-dev/DeepEcho) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sdv-dev/DeepEcho/main?filepath=tutorials/timeseries_data) [![Slack](https://img.shields.io/badge/Slack%20Workspace-Join%20now!-36C5F0?logo=slack)](https://bit.ly/sdv-slack-invite)

- +

@@ -49,12 +49,12 @@ time series**. It provides: [SDV Blog]: https://sdv.dev/blog [Documentation]: https://sdv.dev/SDV [Repository]: https://github.com/sdv-dev/DeepEcho -[License]: https://github.com/sdv-dev/DeepEcho/blob/master/LICENSE +[License]: https://github.com/sdv-dev/DeepEcho/blob/main/LICENSE [Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha -[Slack Logo]: https://github.com/sdv-dev/SDV/blob/master/docs/images/slack.png +[Slack Logo]: https://github.com/sdv-dev/SDV/blob/stable/docs/images/slack.png [Community]: https://bit.ly/sdv-slack-invite -[MyBinder Logo]: https://github.com/sdv-dev/SDV/blob/master/docs/images/mybinder.png -[Tutorials]: https://mybinder.org/v2/gh/sdv-dev/DeepEcho/master?filepath=tutorials +[MyBinder Logo]: https://github.com/sdv-dev/SDV/blob/stable/docs/images/mybinder.png +[Tutorials]: https://mybinder.org/v2/gh/sdv-dev/DeepEcho/main?filepath=tutorials # Install @@ -160,7 +160,7 @@ us developing new features or cool ideas!
diff --git a/deepecho/__init__.py b/deepecho/__init__.py index 7562065..a2169fb 100644 --- a/deepecho/__init__.py +++ b/deepecho/__init__.py @@ -2,7 +2,7 @@ __author__ = 'DataCebo, Inc.' __email__ = 'info@sdv.dev' -__version__ = '0.4.2' +__version__ = '0.6.0.dev1' __path__ = __import__('pkgutil').extend_path(__path__, __name__) from deepecho.demo import load_demo diff --git a/deepecho/models/base.py b/deepecho/models/base.py index fa44058..cfd3982 100644 --- a/deepecho/models/base.py +++ b/deepecho/models/base.py @@ -6,7 +6,7 @@ from deepecho.sequences import assemble_sequences -class DeepEcho(): +class DeepEcho: """The base class for DeepEcho models.""" _verbose = True @@ -28,7 +28,13 @@ def _validate(sequences, context_types, data_types): data_types: See `fit`. """ - dtypes = set(['continuous', 'categorical', 'ordinal', 'count', 'datetime']) + dtypes = set([ + 'continuous', + 'categorical', + 'ordinal', + 'count', + 'datetime', + ]) assert all(dtype in dtypes for dtype in context_types) assert all(dtype in dtypes for dtype in data_types) @@ -99,8 +105,15 @@ def _get_data_types(data, data_types, columns): return dtypes_list - def fit(self, data, entity_columns=None, context_columns=None, - data_types=None, segment_size=None, sequence_index=None): + def fit( + self, + data, + entity_columns=None, + context_columns=None, + data_types=None, + segment_size=None, + sequence_index=None, + ): """Fit the model to a dataframe containing time series data. Args: @@ -135,8 +148,7 @@ def fit(self, data, entity_columns=None, context_columns=None, if segment_size is not None and not isinstance(segment_size, int): if sequence_index is None: raise TypeError( - '`segment_size` must be of type `int` if ' - 'no `sequence_index` is given.' + '`segment_size` must be of type `int` if ' 'no `sequence_index` is given.' 
) if data[sequence_index].dtype.kind != 'M': raise TypeError( @@ -161,7 +173,12 @@ def fit(self, data, entity_columns=None, context_columns=None, data_types = self._get_data_types(data, data_types, self._data_columns) context_types = self._get_data_types(data, data_types, self._context_columns) sequences = assemble_sequences( - data, self._entity_columns, self._context_columns, segment_size, sequence_index) + data, + self._entity_columns, + self._context_columns, + segment_size, + sequence_index, + ) # Validate and fit self._validate(sequences, context_types, data_types) @@ -242,7 +259,7 @@ def sample(self, num_entities=None, context=None, sequence_length=None): # Reformat as a DataFrame group = pd.DataFrame( dict(zip(self._data_columns, sequence)), - columns=self._data_columns + columns=self._data_columns, ) group[self._entity_columns] = entity_values for column, value in zip(self._context_columns, context_values): diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py index 94a7979..c6beadb 100644 --- a/deepecho/models/basic_gan.py +++ b/deepecho/models/basic_gan.py @@ -13,10 +13,13 @@ def _expand_context(data, context): - return torch.cat([ - data, - context.unsqueeze(0).expand(data.shape[0], context.shape[0], context.shape[1]) - ], dim=2) + return torch.cat( + [ + data, + context.unsqueeze(0).expand(data.shape[0], context.shape[0], context.shape[1]), + ], + dim=2, + ) class BasicGenerator(torch.nn.Module): @@ -65,7 +68,7 @@ def forward(self, context=None, sequence_length=None): """ latent = torch.randn( size=(sequence_length, context.size(0), self.latent_size), - device=self.device + device=self.device, ) latent = _expand_context(latent, context) @@ -150,8 +153,16 @@ class BasicGANModel(DeepEcho): _model_data_size = None _generator = None - def __init__(self, epochs=1024, latent_size=32, hidden_size=16, - gen_lr=1e-3, dis_lr=1e-3, cuda=True, verbose=True): + def __init__( + self, + epochs=1024, + latent_size=32, + hidden_size=16, + gen_lr=1e-3, + dis_lr=1e-3, + cuda=True, + verbose=True, + ): self._epochs = epochs self._gen_lr = gen_lr self._dis_lr = dis_lr @@ -211,7 +222,7 @@ def _index_map(columns, types): 'type': column_type, 'min': np.min(values), 'max': np.max(values), - 'indices': (dimensions, dimensions + 1) + 'indices': (dimensions, dimensions + 1), } dimensions += 2 @@ -221,10 +232,7 @@ def _index_map(columns, types): indices[value] = dimensions dimensions += 1 - mapping[column] = { - 'type': column_type, - 'indices': indices - } + mapping[column] = {'type': column_type, 'indices': indices} else: raise ValueError(f'Unsupported type: {column_type}') @@ -317,7 +325,7 @@ def _value_to_tensor(self, tensor, value, properties): self._one_hot_encode(tensor, value, properties) else: - raise ValueError() # Theoretically unreachable + raise ValueError() # Theoretically unreachable def _data_to_tensor(self, data): """Convert the input data to the corresponding tensor. 
@@ -370,7 +378,7 @@ def _tensor_to_data(self, tensor): elif column_type in ('categorical', 'ordinal'): value = self._one_hot_decode(tensor, row, properties) else: - raise ValueError() # Theoretically unreachable + raise ValueError() # Theoretically unreachable column_data.append(value) @@ -412,7 +420,7 @@ def _truncate(self, generated): end_flag = sequence[:, self._data_size] if (end_flag == 1.0).any(): cut_idx = end_flag.detach().cpu().numpy().argmax() - sequence[cut_idx + 1:] = 0.0 + sequence[cut_idx + 1 :] = 0.0 def _generate(self, context, sequence_length=None): generated = self._generator( diff --git a/deepecho/models/par.py b/deepecho/models/par.py index 0b06005..57e2a68 100644 --- a/deepecho/models/par.py +++ b/deepecho/models/par.py @@ -27,10 +27,13 @@ def forward(self, x, c): if isinstance(x, torch.nn.utils.rnn.PackedSequence): x, lengths = torch.nn.utils.rnn.pad_packed_sequence(x) if self.context_size: - x = torch.cat([ - x, - c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]) - ], dim=2) + x = torch.cat( + [ + x, + c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]), + ], + dim=2, + ) x = self.down(x) x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False) @@ -41,10 +44,13 @@ def forward(self, x, c): else: if self.context_size: - x = torch.cat([ - x, - c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]) - ], dim=2) + x = torch.cat( + [ + x, + c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]), + ], + dim=2, + ) x = self.down(x) x, _ = self.rnn(x) @@ -105,6 +111,7 @@ def __init__(self, epochs=128, sample_size=1, cuda=True, verbose=True): self.device = torch.device(device) self.verbose = verbose + self.loss_values = pd.DataFrame(columns=['Epoch', 'Loss']) LOGGER.info('%s instance created', self) @@ -125,7 +132,7 @@ def _idx_map(self, x, t): 'mu': np.nanmean(x[i]), 'std': np.nanstd(x[i]), 'nulls': pd.isnull(x[i]).any(), - 'indices': (idx, idx + 1, idx + 2) + 'indices': (idx, idx + 1, idx + 2), } idx += 3 @@ -135,15 +142,12 @@ def _idx_map(self, x, t): 'min': np.nanmin(x[i]), 'range': np.nanmax(x[i]) - np.nanmin(x[i]), 'nulls': pd.isnull(x[i]).any(), - 'indices': (idx, idx + 1, idx + 2) + 'indices': (idx, idx + 1, idx + 2), } idx += 3 elif t == 'categorical' or t == 'ordinal': - idx_map[i] = { - 'type': t, - 'indices': {} - } + idx_map[i] = {'type': t, 'indices': {}} idx += 1 for v in set(x[i]): if pd.isnull(v): @@ -185,8 +189,8 @@ def _build(self, sequences, context_types, data_types): 'indices': { '<START>': self._data_dims, '<END>': self._data_dims + 1, - '<BODY>': self._data_dims + 2 - } + '<BODY>': self._data_dims + 2, + }, } self._data_dims += 3 @@ -224,7 +228,10 @@ def _data_to_tensor(self, data): x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0 - elif props['type'] in ['categorical', 'ordinal']: # categorical + elif props['type'] in [ + 'categorical', + 'ordinal', + ]: # categorical value = data[key][i] if pd.isnull(value): value = None @@ -249,15 +256,21 @@ def _context_to_tensor(self, context): for key, props in self._ctx_map.items(): if props['type'] in ['continuous', 'datetime']: mu_idx, sigma_idx, missing_idx = props['indices'] - x[mu_idx] = 0.0 if (pd.isnull(context[key]) or props['std'] == 0) else ( - context[key] - props['mu']) / props['std'] + x[mu_idx] = ( + 0.0 + if (pd.isnull(context[key]) or props['std'] == 0) + else (context[key] - props['mu']) / props['std'] + ) x[sigma_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0 elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices']
- x[r_idx] = 0.0 if (pd.isnull(context[key]) or props['range'] == 0) else ( - context[key] - props['min']) / props['range'] + x[r_idx] = ( + 0.0 + if (pd.isnull(context[key]) or props['range'] == 0) + else (context[key] - props['min']) / props['range'] + ) x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0 @@ -321,9 +334,13 @@ def fit_sequences(self, sequences, context_types, data_types): self._model = PARNet(self._data_dims, self._ctx_dims).to(self.device) optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-3) - iterator = range(self.epochs) + iterator = tqdm(range(self.epochs), disable=(not self.verbose)) if self.verbose: - iterator = tqdm(iterator) + pbar_description = 'Loss ({loss:.3f})' + iterator.set_description(pbar_description.format(loss=0)) + + # Reset loss_values dataframe + self.loss_values = pd.DataFrame(columns=['Epoch', 'Loss']) X_padded, seq_len = torch.nn.utils.rnn.pad_packed_sequence(X) for epoch in iterator: @@ -333,8 +350,21 @@ def fit_sequences(self, sequences, context_types, data_types): optimizer.zero_grad() loss = self._compute_loss(X_padded[1:, :, :], Y_padded[:-1, :, :], seq_len) loss.backward() + + epoch_loss_df = pd.DataFrame({ + 'Epoch': [epoch], + 'Loss': [loss.item()], + }) + if not self.loss_values.empty: + self.loss_values = pd.concat([ + self.loss_values, + epoch_loss_df, + ]).reset_index(drop=True) + else: + self.loss_values = epoch_loss_df + if self.verbose: - iterator.set_description(f'Epoch {epoch +1} | Loss {loss.item()}') + iterator.set_description(pbar_description.format(loss=loss.item())) optimizer.step() @@ -358,7 +388,7 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): This list contains the length of each sequence. """ log_likelihood = 0.0 - _, batch_size, input_size = X_padded.shape + _, batch_size, _input_size = X_padded.shape for key, props in self._data_map.items(): if props['type'] in ['continuous', 'timestamp']: @@ -369,14 +399,16 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): for i in range(batch_size): dist = torch.distributions.normal.Normal( - mu[:seq_len[i], i], sigma[:seq_len[i], i]) - log_likelihood += torch.sum(dist.log_prob(X_padded[-seq_len[i]:, i, mu_idx])) + mu[: seq_len[i], i], sigma[: seq_len[i], i] + ) + log_likelihood += torch.sum(dist.log_prob(X_padded[-seq_len[i] :, i, mu_idx])) - p_true = X_padded[:seq_len[i], i, missing_idx] - p_pred = missing[:seq_len[i], i] + p_true = X_padded[: seq_len[i], i, missing_idx] + p_pred = missing[: seq_len[i], i] log_likelihood += torch.sum(p_true * p_pred) - log_likelihood += torch.sum((1.0 - p_true) * torch.log( - 1.0 - torch.exp(p_pred))) + log_likelihood += torch.sum( + (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)) + ) elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices'] @@ -387,22 +419,26 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): for i in range(batch_size): dist = torch.distributions.negative_binomial.NegativeBinomial( - r[:seq_len[i], i], p[:seq_len[i], i], validate_args=False) - log_likelihood += torch.sum(dist.log_prob(x[:seq_len[i], i])) - - p_true = X_padded[:seq_len[i], i, missing_idx] - p_pred = missing[:seq_len[i], i] + r[: seq_len[i], i], + p[: seq_len[i], i], + validate_args=False, + ) + log_likelihood += torch.sum(dist.log_prob(x[: seq_len[i], i])) + + p_true = X_padded[: seq_len[i], i, missing_idx] + p_pred = missing[: seq_len[i], i] log_likelihood += torch.sum(p_true * p_pred) - log_likelihood += torch.sum((1.0 - p_true) * torch.log( - 1.0 - torch.exp(p_pred))) + log_likelihood += 
torch.sum( + (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)) + ) elif props['type'] in ['categorical', 'ordinal']: idx = list(props['indices'].values()) log_softmax = torch.nn.functional.log_softmax(Y_padded[:, :, idx], dim=2) for i in range(batch_size): - target = X_padded[:seq_len[i], i, idx] - predicted = log_softmax[:seq_len[i], i] + target = X_padded[: seq_len[i], i, idx] + predicted = log_softmax[: seq_len[i], i] target = torch.argmax(target, dim=1).unsqueeze(dim=1) log_likelihood += torch.sum(predicted.gather(dim=1, index=target)) @@ -426,14 +462,14 @@ def _tensor_to_data(self, x): data[key] = [] for i in range(seq_len): if props['type'] in ['continuous', 'datetime']: - mu_idx, sigma_idx, missing_idx = props['indices'] + mu_idx, _sigma_idx, missing_idx = props['indices'] if (x[i, 0, missing_idx] > 0) and props['nulls']: data[key].append(None) else: data[key].append(x[i, 0, mu_idx].item() * props['std'] + props['mu']) elif props['type'] in ['count']: - r_idx, p_idx, missing_idx = props['indices'] + r_idx, _p_idx, missing_idx = props['indices'] if x[i, 0, missing_idx] > 0 and props['nulls']: data[key].append(None) else: @@ -456,7 +492,7 @@ def _tensor_to_data(self, x): def _sample_state(self, x): log_likelihood = 0.0 - seq_len, batch_size, input_size = x.shape + seq_len, batch_size, _input_size = x.shape assert seq_len == 1 and batch_size == 1 for key, props in self._data_map.items(): diff --git a/deepecho/sequences.py b/deepecho/sequences.py index e425053..847c511 100644 --- a/deepecho/sequences.py +++ b/deepecho/sequences.py @@ -117,20 +117,20 @@ def _convert_to_dicts(segments, context_columns): else: context = [] - lists = [ - list(row) - for _, row in segment.items() - ] - sequences.append({ - 'context': context, - 'data': lists - }) + lists = [list(row) for _, row in segment.items()] + sequences.append({'context': context, 'data': lists}) return sequences -def assemble_sequences(data, entity_columns, context_columns, segment_size, - sequence_index, drop_sequence_index=True): +def assemble_sequences( + data, + entity_columns, + context_columns, + segment_size, + sequence_index, + drop_sequence_index=True, +): """Build sequences from the data, grouping first by entity and then segmenting by size. Input is a ``pandas.DataFrame`` containing all the data, lists of entity and context @@ -185,8 +185,9 @@ def assemble_sequences(data, entity_columns, context_columns, segment_size, if len(sequence[context_columns].drop_duplicates()) > 1: raise ValueError('Context columns are not constant within each entity.') - entity_segments = segment_sequence(sequence, segment_size, - sequence_index, drop_sequence_index) + entity_segments = segment_sequence( + sequence, segment_size, sequence_index, drop_sequence_index + ) segments.extend(entity_segments) return _convert_to_dicts(segments, context_columns) diff --git a/latest_requirements.txt b/latest_requirements.txt new file mode 100644 index 0000000..5e4ab8d --- /dev/null +++ b/latest_requirements.txt @@ -0,0 +1,4 @@ +numpy==1.26.4 +pandas==2.2.1 +torch==2.2.2 +tqdm==4.66.2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..059dd39 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,205 @@ +[project] +name = 'deepecho' +description = 'Create sequential synthetic data of mixed types using a GAN.' 
+authors = [{ name = 'DataCebo, Inc.', email = 'info@sdv.dev' }] +classifiers = [ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: Free for non-commercial use', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', +] +keywords = ['deepecho', 'DeepEcho'] +dynamic = ['version'] +license = { text = 'BSL-1.1' } +requires-python = '>=3.8,<3.13' +readme = 'README.md' +dependencies = [ + "numpy>=1.20.0;python_version<'3.10'", + "numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'", + "numpy>=1.26.0;python_version>='3.12'", + "pandas>=1.1.3;python_version<'3.10'", + "pandas>=1.3.4;python_version>='3.10' and python_version<'3.11'", + "pandas>=1.5.0;python_version>='3.11'", + "torch>=1.8.0;python_version<'3.10'", + "torch>=1.11.0;python_version>='3.10' and python_version<'3.11'", + "torch>=2.0.0;python_version>='3.11' and python_version<'3.12'", + "torch>=2.2.0;python_version>='3.12'", + 'tqdm>=4.29', +] + +[project.urls] +"Source Code"= "https://github.com/sdv-dev/Deepecho/" +"Issue Tracker" = "https://github.com/sdv-dev/Deepecho/issues" +"Twitter" = "https://twitter.com/sdv_dev" +"Chat" = "https://bit.ly/sdv-slack-invite" + +[project.optional-dependencies] +test = [ + 'pytest>=3.4.2', + 'pytest-cov>=2.6.0', + 'pytest-rerunfailures>=10.3,<15', + 'jupyter>=1.0.0,<2', + 'rundoc>=0.4.3,<0.5', + 'pytest-runner >= 2.11.1', + 'tomli>=2.0.0,<3', +] +dev = [ + 'deepecho[test]', + + # general + 'setuptools<70', + 'build>=1.0.0,<2', + 'bump-my-version>=0.18.3,<1', + 'pip>=9.0.1', + 'watchdog>=1.0.1,<5', + + # style check + 'ruff>=0.3.2,<1', + + # distribute on PyPI + 'twine>=1.10.0,<4', + 'wheel>=0.30.0', + + # Advanced testing + 'coverage>=4.5.1,<6', + 'tox>=2.9.1,<4', + + # Invoking test commands + 'invoke' +] + +[tool.setuptools] +include-package-data = true +license-files = ['LICENSE'] + +[tool.setuptools.packages.find] +include = ['deepecho', 'deepecho.*'] +namespaces = false + +[tool.setuptools.package-data] +'*' = [ + 'AUTHORS.rst', + 'CONTRIBUTING.rst', + 'HISTORY.md', + 'README.md', + '*.md', + '*.rst', + 'conf.py', + 'Makefile', + 'make.bat', + '*.jpg', + '*.png', + '*.gif', +] +'deepecho' = ['data/demo.csv'] + +[tool.setuptools.exclude-package-data] +'*' = [ + '* __pycache__', + '*.py[co]', + 'static_code_analysis.txt', +] + +[tool.setuptools.dynamic] +version = {attr = 'deepecho.__version__'} + +[tool.isort] +include_trailing_comment = true +line_length = 99 +lines_between_types = 0 +multi_line_output = 4 +not_skip = ['__init__.py'] +use_parentheses = true + +[tool.pytest.ini_options] +collect_ignore = ['pyproject.toml'] + +[tool.bumpversion] +current_version = "0.6.0.dev1" +parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
+serialize = [ + '{major}.{minor}.{patch}.{release}{candidate}', + '{major}.{minor}.{patch}' +] +search = '{current_version}' +replace = '{new_version}' +regex = false +ignore_missing_version = false +tag = true +sign_tags = false +tag_name = 'v{new_version}' +tag_message = 'Bump version: {current_version} → {new_version}' +allow_dirty = false +commit = true +message = 'Bump version: {current_version} → {new_version}' +commit_args = '' + +[tool.bumpversion.parts.release] +first_value = 'dev' +optional_value = 'release' +values = [ + 'dev', + 'release' +] + +[[tool.bumpversion.files]] +filename = "deepecho/__init__.py" +search = "__version__ = '{current_version}'" +replace = "__version__ = '{new_version}'" + +[build-system] +requires = ['setuptools', 'wheel'] +build-backend = 'setuptools.build_meta' + +[tool.ruff] +preview = true +line-length = 99 +src = ["deepecho"] +target-version = "py312" +exclude = [ + "docs", + ".tox", + ".git", + "__pycache__", + ".ipynb_checkpoints" +] + +[tool.ruff.lint] +select = [ + # Pyflakes + "F", + # Pycodestyle + "E", + "W", + # isort + "I001" +] +ignore = [ + "E501", + "D107", # Missing docstring in __init__ + "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 +] + +[tool.ruff.lint.pep8-naming] +extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"] + +[tool.ruff.lint.isort] +known-first-party = ["deepecho"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] + +[tool.ruff.format] +quote-style = "single" +indent-style = "space" +preview = true + +[tool.ruff.lint.pydocstyle] +convention = "google" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 8e60f2a..0000000 --- a/setup.cfg +++ /dev/null @@ -1,72 +0,0 @@ -[bumpversion] -current_version = 0.4.2 -commit = True -tag = True -parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? -serialize = - {major}.{minor}.{patch}.{release}{candidate} - {major}.{minor}.{patch} - -[bumpversion:part:release] -optional_value = release -first_value = dev -values = - dev - release - -[bumpversion:part:candidate] - -[bumpversion:file:setup.py] -search = version='{current_version}' -replace = version='{new_version}' - -[bumpversion:file:deepecho/__init__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' - -[bdist_wheel] -universal = 1 - -[flake8] -max-line-length = 99 -exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints -docstring-convetion = google -extend-ignore = - SFS3, # String formating using f-string - VNE001, # Single letter variable names are not allowed.
- D107, # Missing docstring in __init__ - D417 # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 -ignore-names = - X, - C, - X_padded, - Y, - Y_padded - -[isort] -include_trailing_comment = True -line_length = 99 -lines_between_types = 0 -multi_line_output = 4 -not_skip = __init__.py -use_parentheses = True - -[aliases] -test = pytest - -[tool:pytest] -collect_ignore = ['setup.py'] - -[pylint] -persistent = no -extension-pkg-whitelist = numpy -generated-members = torch.* -min-similarity-lines = 5 -ignore-comments = yes -ignore-docstrings = yes -ignore-imports = yes -max-args = 10 -ignore = par.py -disable = R0914, R0902, R0903, C0102, C0209, W0703, W0223, E1102 -good-names = i, j, k, X, x, y, X_train, X_test, ex - diff --git a/setup.py b/setup.py deleted file mode 100644 index af6fd62..0000000 --- a/setup.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""The setup script.""" - -from setuptools import setup, find_packages - -with open('README.md', encoding='utf-8') as readme_file: - readme = readme_file.read() - -with open('HISTORY.md', encoding='utf-8') as history_file: - history = history_file.read() - -install_requires = [ - "numpy>=1.20.0,<2;python_version<'3.10'", - "numpy>=1.23.3,<2;python_version>='3.10'", - "pandas>=1.1.3;python_version<'3.10'", - "pandas>=1.3.4;python_version>='3.10' and python_version<'3.11'", - "pandas>=1.5.0;python_version>='3.11'", - "torch>=1.8.0;python_version<'3.10'", - "torch>=1.11.0;python_version>='3.10' and python_version<'3.11'", - "torch>=2.0.0;python_version>='3.11'", - 'tqdm>=4.15,<5', -] - -setup_requires = [ - 'pytest-runner>=2.11.1', -] - -tests_require = [ - 'pytest>=3.4.2', - 'pytest-cov>=2.6.0', - 'pytest-rerunfailures>=9.0.0,<10', - 'jupyter>=1.0.0,<2', - 'rundoc>=0.4.3,<0.5', -] - -development_requires = [ - # general - 'setuptools<49.2', - 'bumpversion>=0.5.3,<0.6', - 'pip>=9.0.1', - 'watchdog>=0.8.3,<0.11', - - # style check - 'flake8>=3.7.7,<4', - 'flake8-absolute-import>=1.0,<2', - 'flake8-docstrings>=1.5.0,<2', - 'flake8-sfs>=0.0.3,<0.1', - 'isort>=4.3.4,<5', - 'pylint>=2.5.3,<3', - 'flake8-builtins>=1.5.3,<1.6', - 'flake8-debugger>=4.0.0,<4.1', - 'flake8-mock>=0.3,<0.4', - 'dlint>=0.11.0,<0.12', - 'flake8-eradicate>=1.1.0,<1.2', - 'flake8-mutable>=1.2.0,<1.3', - 'flake8-fixme>=1.1.1,<1.2', - 'flake8-multiline-containers>=0.0.18,<0.1', - 'flake8-quotes>=3.3.0,<4', - 'flake8-variables-names>=0.0.4,<0.1', - 'pep8-naming>=0.12.1,<0.13', - 'flake8-expression-complexity>=0.0.9,<0.1', - 'flake8-print>=4.0.0,<4.1', - - # fix style issues - 'autoflake>=1.1,<2', - 'autopep8>=1.4.3,<1.6', - - # distribute on PyPI - 'twine>=1.10.0,<4', - 'wheel>=0.30.0', - - # Advanced testing - 'coverage>=4.5.1,<6', - 'tox>=2.9.1,<4', - - # Invoking test commands - 'invoke' -] - -setup( - author='DataCebo, Inc.', - author_email='info@sdv.dev', - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'License :: Free for non-commercial use', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - ], - description='Create sequential synthetic data of mixed types using a GAN.', - extras_require={ - 'test': tests_require, - 'dev': development_requires + 
tests_require, - }, - include_package_data=True, - install_requires=install_requires, - keywords='deepecho deepecho DeepEcho', - license='BSL-1.1', - long_description=readme + '\n\n' + history, - long_description_content_type='text/markdown', - name='deepecho', - packages=find_packages(include=['deepecho', 'deepecho.*']), - python_requires='>=3.8,<3.12', - setup_requires=setup_requires, - test_suite='tests', - tests_require=tests_require, - url='https://github.com/sdv-dev/DeepEcho', - version='0.4.2', - zip_safe=False, -) diff --git a/static_code_analysis.txt b/static_code_analysis.txt new file mode 100644 index 0000000..6409e05 --- /dev/null +++ b/static_code_analysis.txt @@ -0,0 +1,101 @@ +Run started:2024-04-09 16:11:48.120710 + +Test results: +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/base.py:38:8 +37 ]) +38 assert all(dtype in dtypes for dtype in context_types) +39 assert all(dtype in dtypes for dtype in data_types) + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/base.py:39:8 +38 assert all(dtype in dtypes for dtype in context_types) +39 assert all(dtype in dtypes for dtype in data_types) +40 + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/base.py:42:12 +41 for sequence in sequences: +42 assert len(sequence['context']) == len(context_types) +43 assert len(sequence['data']) == len(data_types) + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/base.py:43:12 +42 assert len(sequence['context']) == len(context_types) +43 assert len(sequence['data']) == len(data_types) +44 lengths = [len(x) for x in sequence['data']] + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/base.py:45:12 +44 lengths = [len(x) for x in sequence['data']] +45 assert len(set(lengths)) == 1 +46 + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. 
+ Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/basic_gan.py:366:8 +365 sequence_length, num_sequences, _ = tensor.shape +366 assert num_sequences == 1 +367 + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/par.py:455:8 +454 seq_len, batch_size, _ = x.shape +455 assert batch_size == 1 +456 + +-------------------------------------------------- +>> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. + Severity: Low Confidence: High + CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) + More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html + Location: ./deepecho/models/par.py:496:8 +495 seq_len, batch_size, _input_size = x.shape +496 assert seq_len == 1 and batch_size == 1 +497 + +-------------------------------------------------- + +Code scanned: + Total lines of code: 1467 + Total lines skipped (#nosec): 0 + Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 + +Run metrics: + Total issues (by severity): + Undefined: 0 + Low: 8 + Medium: 0 + High: 0 + Total issues (by confidence): + Undefined: 0 + Low: 0 + Medium: 0 + High: 8 +Files skipped (0): diff --git a/tasks.py b/tasks.py index 3e1b0e4..7710f5c 100644 --- a/tasks.py +++ b/tasks.py @@ -2,21 +2,21 @@ import inspect import operator import os -import re -import pkg_resources -import platform import shutil import stat +import sys from pathlib import Path +import tomli from invoke import task - +from packaging.requirements import Requirement +from packaging.version import Version COMPARISONS = { '>=': operator.ge, '>': operator.gt, '<': operator.lt, - '<=': operator.le + '<=': operator.le, } @@ -39,48 +39,54 @@ def unit(c): c.run('python -m pytest ./tests/unit --reruns 3') -def _validate_python_version(line): - is_valid = True - for python_version_match in re.finditer(r"python_version(<=?|>=?|==)\'(\d\.?)+\'", line): - python_version = python_version_match.group(0) - comparison = re.search(r'(>=?|<=?|==)', python_version).group(0) - version_number = python_version.split(comparison)[-1].replace("'", "") - comparison_function = COMPARISONS[comparison] - is_valid = is_valid and comparison_function( - pkg_resources.parse_version(platform.python_version()), - pkg_resources.parse_version(version_number), - ) - - return is_valid +def _get_minimum_versions(dependencies, python_version): + min_versions = {} + for dependency in dependencies: + if '@' in dependency: + name, url = dependency.split(' @ ') + min_versions[name] = f'{url}#egg={name}' + continue + + req = Requirement(dependency) + if ';' in dependency: + marker = req.marker + if marker and not marker.evaluate({'python_version': python_version}): + continue # Skip this dependency if the marker does not apply to the current Python version + + if req.name not in min_versions: + min_version = next( + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), + None, + ) + if min_version: + min_versions[req.name] = f'{req.name}=={min_version}' 
+ + elif '@' not in min_versions[req.name]: + existing_version = Version(min_versions[req.name].split('==')[1]) + new_version = next( + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), + existing_version, + ) + if new_version > existing_version: + min_versions[req.name] = ( + f'{req.name}=={new_version}' # Change when a valid newer version is found + ) + + return list(min_versions.values()) @task def install_minimum(c): - with open('setup.py', 'r') as setup_py: - lines = setup_py.read().splitlines() - - versions = [] - started = False - for line in lines: - if started: - if line == ']': - break - - line = line.strip() - if _validate_python_version(line): - requirement = re.match(r'[^>]*', line).group(0) - requirement = re.sub(r"""['",]""", '', requirement) - version = re.search(r'>=?(\d\.?)+', line).group(0) - if version: - version = re.sub(r'>=?', '==', version) - version = re.sub(r"""['",]""", '', version) - requirement += version - versions.append(requirement) + with open('pyproject.toml', 'rb') as pyproject_file: + pyproject_data = tomli.load(pyproject_file) - elif line.startswith('install_requires = ['): - started = True + dependencies = pyproject_data.get('project', {}).get('dependencies', []) + python_version = '.'.join(map(str, sys.version_info[:2])) + minimum_versions = _get_minimum_versions(dependencies, python_version) - c.run(f'python -m pip install {" ".join(versions)}') + if minimum_versions: + install_deps = ' '.join(minimum_versions) + c.run(f'python -m pip install {install_deps}') @task @@ -110,19 +116,20 @@ def readme(c): def tutorials(c): for ipynb_file in glob.glob('tutorials/*.ipynb') + glob.glob('tutorials/**/*.ipynb'): if '.ipynb_checkpoints' not in ipynb_file: - c.run(( - 'jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 ' - f'--to=html --stdout "{ipynb_file}"' - ), hide='out') + c.run( + ( + 'jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 ' + f'--to=html --stdout "{ipynb_file}"' + ), + hide='out', + ) @task def lint(c): check_dependencies(c) - c.run('flake8 deepecho') - c.run('flake8 tests') - c.run('isort -c --recursive deepecho tests') - c.run('pylint deepecho --rcfile=setup.cfg') + c.run('ruff check .') + c.run('ruff format . 
--check') def remove_readonly(func, path, _): diff --git a/tests/integration/test_basic_gan.py b/tests/integration/test_basic_gan.py index 8ed516d..c58d6a1 100644 --- a/tests/integration/test_basic_gan.py +++ b/tests/integration/test_basic_gan.py @@ -16,15 +16,15 @@ def test_basic(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] + ], }, { 'context': [], 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] - } + ], + }, ] context_types = [] data_types = ['continuous', 'continuous'] @@ -41,15 +41,15 @@ def test_conditional(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] + ], }, { 'context': [1], 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'continuous'] @@ -66,15 +66,15 @@ def test_mixed(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0, 1, 0, 1, 0, 1], - ] + ], }, { 'context': [1], 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'categorical'] @@ -91,15 +91,15 @@ def test_count(self): 'data': [ [0, 5, 5, 3, 1, 1], [0, 1, 2, 1, 0, 1], - ] + ], }, { 'context': [1.1], 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['continuous'] data_types = ['count', 'categorical'] @@ -116,15 +116,15 @@ def test_variable_length(self): 'data': [ [0, 5, 5, 3, 1, 1, 0], [0, 1, 2, 1, 0, 1, 2], - ] + ], }, { 'context': [1], 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['count'] data_types = ['count', 'categorical'] diff --git a/tests/integration/test_par.py b/tests/integration/test_par.py index f633a35..55ce0c0 100644 --- a/tests/integration/test_par.py +++ b/tests/integration/test_par.py @@ -18,15 +18,15 @@ def test_basic(self): 'data': [ [0.0, np.nan, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] + ], }, { 'context': [], 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, np.nan], - ] - } + ], + }, ] context_types = [] data_types = ['continuous', 'continuous'] @@ -35,6 +35,10 @@ def test_basic(self): model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([]) + # Assert + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} + assert len(model.loss_values) == 128 + def test_conditional(self): """Test the ``PARModel`` with conditional sampling.""" sequences = [ @@ -43,15 +47,15 @@ def test_conditional(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, np.nan, 0.0], - ] + ], }, { 'context': [1], 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0.0, 0.1, np.nan, 0.3, 0.4, 0.5], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'continuous'] @@ -60,6 +64,10 @@ def test_conditional(self): model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) + # Assert + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} + assert len(model.loss_values) == 128 + def test_mixed(self): """Test the ``PARModel`` with mixed input data.""" sequences = [ @@ -68,15 +76,15 @@ def test_mixed(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0, 1, 0, 1, 0, 1], - ] + ], }, { 'context': [1], 'data': [ [0.5, np.nan, 0.3, 0.2, np.nan, 0.0], [0, 1, 0, 1, np.nan, 1], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'categorical'] @@ -85,6 +93,10 @@ def test_mixed(self): model.fit_sequences(sequences, 
context_types, data_types) model.sample_sequence([0]) + # Assert + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} + assert len(model.loss_values) == 128 + def test_count(self): """Test the PARModel with datatype ``count``.""" sequences = [ @@ -93,15 +105,15 @@ def test_count(self): 'data': [ [0, 5, 5, np.nan, 1, 1], [0, 1, 2, 1, 0, 1], - ] + ], }, { 'context': [1.1], 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['continuous'] data_types = ['count', 'categorical'] @@ -110,6 +122,10 @@ def test_count(self): model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) + # Assert + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} + assert len(model.loss_values) == 128 + def test_variable_length(self): """Test ``PARModel`` with variable data length.""" sequences = [ @@ -118,15 +134,15 @@ def test_variable_length(self): 'data': [ [0, 5, 5, 3, 1, 1, 0], [0, 1, 2, 1, 0, 1, 2], - ] + ], }, { 'context': [1], 'data': [ [1, 6, 6, 4, 2, 2], [np.nan, 1, 0, 1, 0, np.nan], - ] - } + ], + }, ] context_types = ['count'] data_types = ['count', 'categorical'] @@ -134,3 +150,7 @@ def test_variable_length(self): model = PARModel() model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) + + # Assert + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} + assert len(model.loss_values) == 128 diff --git a/tests/test_tasks.py b/tests/test_tasks.py new file mode 100644 index 0000000..d088673 --- /dev/null +++ b/tests/test_tasks.py @@ -0,0 +1,39 @@ +"""Tests for the ``tasks.py`` file.""" + +from tasks import _get_minimum_versions + + +def test_get_minimum_versions(): + """Test the ``_get_minimum_versions`` method. + + The method should return the minimum versions of the dependencies for the given python version. + If a library is linked to an URL, the minimum version should be the URL. 
+ """ + # Setup + dependencies = [ + "numpy>=1.20.0,<2;python_version<'3.10'", + "numpy>=1.23.3,<2;python_version>='3.10'", + "pandas>=1.2.0,<2;python_version<'3.10'", + "pandas>=1.3.0,<2;python_version>='3.10'", + 'humanfriendly>=8.2,<11', + 'pandas @ git+https://github.com/pandas-dev/pandas.git@master', + ] + + # Run + minimum_versions_39 = _get_minimum_versions(dependencies, '3.9') + minimum_versions_310 = _get_minimum_versions(dependencies, '3.10') + + # Assert + expected_versions_39 = [ + 'numpy==1.20.0', + 'git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', + 'humanfriendly==8.2', + ] + expected_versions_310 = [ + 'numpy==1.23.3', + 'git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', + 'humanfriendly==8.2', + ] + + assert minimum_versions_39 == expected_versions_39 + assert minimum_versions_310 == expected_versions_310 diff --git a/tests/unit/test_sequences.py b/tests/unit/test_sequences.py index 8718b23..019a480 100644 --- a/tests/unit/test_sequences.py +++ b/tests/unit/test_sequences.py @@ -4,7 +4,11 @@ import pytest from deepecho.sequences import ( - assemble_sequences, segment_by_size, segment_by_time, segment_sequence) + assemble_sequences, + segment_by_size, + segment_by_time, + segment_sequence, +) def test_segment_by_size(): @@ -19,18 +23,27 @@ def test_segment_by_size(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [4, 5, 6], - 'b': [6, 5, 4], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [7, 8, 9], - 'b': [3, 2, 1], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [4, 5, 6], + 'b': [6, 5, 4], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [7, 8, 9], + 'b': [3, 2, 1], + }), + out[2], + ) def test_segment_by_time(): @@ -47,18 +60,27 @@ def test_segment_by_time(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [4, 5, 6], - 'b': [6, 5, 4], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [7, 8, 9], - 'b': [3, 2, 1], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [4, 5, 6], + 'b': [6, 5, 4], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [7, 8, 9], + 'b': [3, 2, 1], + }), + out[2], + ) def test_segment_sequence(): @@ -73,18 +95,27 @@ def test_segment_sequence(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [7, 8, 9], - 'b': [3, 2, 1], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [4, 5, 6], - 'b': [6, 5, 4], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [7, 8, 9], + 'b': [3, 2, 1], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [4, 5, 6], + 'b': [6, 5, 4], + }), + out[2], + ) def test_segment_sequence_sequence_index(): @@ -99,15 +130,24 @@ def 
test_segment_sequence_sequence_index(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'b': [6, 5, 4], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'b': [3, 2, 1], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'b': [6, 5, 4], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'b': [3, 2, 1], + }), + out[2], + ) def test__assemble_sequences_no_entity_no_context(): diff --git a/tox.ini b/tox.ini index 65681eb..f926e1e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py38-lint, py3{8,9,10,11}-{readme,unit,integration,minimum,tutorials} +envlist = py38-lint, py3{8,9,10,11,12}-{readme,unit,integration,minimum,tutorials} [testenv] skipsdist = false
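
Note on the minimum-version logic introduced in tasks.py: `_get_minimum_versions` parses each PEP 508 dependency string from `pyproject.toml`, skips entries whose environment marker does not match the target Python version, keeps URL dependencies as `url#egg=name` pins, and otherwise pins the lowest version allowed by a `>=` or `==` specifier. A minimal standalone sketch of that rule, assuming only that `packaging` is installed (the `minimum_pin` helper and the dependency strings below are illustrative, not part of this diff):

from packaging.requirements import Requirement

def minimum_pin(dependency, python_version):
    """Return a 'name==version' pin, or None when the dependency does not apply."""
    req = Requirement(dependency)
    # Markers such as ;python_version<'3.10' are evaluated against the target version.
    if req.marker and not req.marker.evaluate({'python_version': python_version}):
        return None
    # The version attached to a '>=' or '==' specifier is taken as the minimum pin.
    min_version = next(
        (spec.version for spec in req.specifier if spec.operator in ('>=', '==')),
        None,
    )
    return f'{req.name}=={min_version}' if min_version else None

print(minimum_pin("numpy>=1.20.0,<2;python_version<'3.10'", '3.9'))   # numpy==1.20.0
print(minimum_pin("numpy>=1.20.0,<2;python_version<'3.10'", '3.12'))  # None

This mirrors why `install_minimum` can now read `pyproject.toml` directly via `tomli` instead of regex-scanning `setup.py`, and `tests/test_tasks.py` exercises the same marker-dependent behavior across Python versions.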