diff --git a/.dockerignore b/.dockerignore index 7b0ce18..34ab9b9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,11 +1,9 @@ * Dockerfile* .dockerignore -!metator/* -!setup.py -!setup.cfg +!src/* +!external/* !metator.yaml -!requirements.txt +!pyproject.toml !MANIFEST.in !README.md -!external diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..66f2cbf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,51 @@ +name: Test and Doc + +on: + push: + pull_request: + +defaults: + run: + shell: bash -l {0} + +permissions: + contents: write + +jobs: + Test: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: [ "3.9", "3.10", "3.11" ] + include: + - os: ubuntu-latest + # - os: windows-latest + # - os: macos-latest + runs-on: ${{ matrix.os }} + + steps: + + - uses: actions/checkout@v4 + + - name: 🛠️ Install Python ${{ matrix.python-version }} and deps with micromamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: metator.yaml + init-shell: bash + cache-environment: false + post-cleanup: 'none' + generate-run-shell: true + create-args: >- + python=${{ matrix.python-version }} + + - name: 📦 Install package + run: | + pip install .[test] + shell: micromamba-shell {0} + + - name: 🧪 Run tests with Python ${{ matrix.python-version }} + run: | + ruff check . --select=E9,F63,F7,F82 + pytest --cov --cov-report=xml + shell: micromamba-shell {0} diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index dd2b996..b216d98 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -5,7 +5,6 @@ name: Upload Python Package on PyPI on: release: types: [created] - branches: [master] jobs: deploy: @@ -13,21 +12,22 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + + - name: Set up Python 3.10 + uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.10' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install hatch twine - name: Build and publish env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{secrets.PYPI_TOKEN}} run: | - python setup.py sdist bdist_wheel - twine upload dist/* \ No newline at end of file + hatch build + twine upload dist/* diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml deleted file mode 100644 index aba3555..0000000 --- a/.github/workflows/python-package.yml +++ /dev/null @@ -1,61 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Build - -on: - push: - pull_request: - -defaults: - run: - shell: bash -l {0} - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.8', '3.9', '3.10'] - - steps: - - - uses: actions/checkout@v3 - - - name: Load apt packages - run: | - sudo apt-get update - sudo apt-get -y install git make g++ default-jre default-jdk zlib1g-dev - - - name: Install Louvain - run: | - cd external - tar -xzf louvain-generic.tar.gz - cd gen-louvain - make - cd ../.. - - - name: Install Leiden - run: | - git clone https://github.com/vtraag/networkanalysis.git - cd networkanalysis - ./gradlew build - cd .. 
- - - name: Install Conda environment from metator.yaml - uses: mamba-org/setup-micromamba@v1 - with: - environment-file: metator.yaml - generate-run-shell: true - create-args: >- - python=${{ matrix.python-version }} - - - name: Test with pytest - run: | - micromamba activate metator - export LOUVAIN_PATH=external/gen-louvain - export LEIDEN_PATH=networkanalysis/build/libs/networkanalysis-1.2.0.jar - pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules metator - pytest --cov=metator - codecov diff --git a/.gitignore b/.gitignore index 27c5025..aabd421 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ wheels/ .installed.cfg *.egg MANIFEST +metator.code-workspace # PyInstaller # Usually these files are written by a python script from a template @@ -133,6 +134,9 @@ venv.bak/ dmypy.json nf-metator.sif -# Louvain and Leiden installation -external/gen-louvain/ -networkanalysis +# Artifacts installation +artifacts/ +gen-louvain/ +networkanalysis/ +pairix/ +bowtie2/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..17a8843 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + args: ['--config=./pyproject.toml'] + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a40542..b60ed55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file. +## [1.3.4] - 2025-02-19 +- Package now relies on `pyproject.toml` for build configuration with `hatch`. +- Binaries for `louvain` and `leiden` clustering algorithms are now embedded in the package. +- Uses pre-commit hooks for code formatting and linting. +- Fix deprecated Bio.SeqUtils.GC to Bio.SeqUtils.gc_fraction. +- `Biopython` is pinned <= 1.80 to work with `micomplete 1.1.1`. + ## [1.3.3] - 2023-11-27 - Improve ci. - Add pairix as requirements. diff --git a/Dockerfile b/Dockerfile index 1cdd07f..474e426 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,39 +1,21 @@ -# syntax=docker/dockerfile:1 - FROM mambaorg/micromamba:latest -LABEL Name=metator Version=1.3.3 +LABEL Name=metator Version=1.3.4 COPY --chown=$MAMBA_USER:$MAMBA_USER . ./ # Install 3rd party packages USER root RUN apt update && \ - apt install -y --no-install-recommends git make g++ curl default-jre default-jdk zlib1g-dev - -# Install Louvain -RUN cd ./external && \ - tar -xzf louvain-generic.tar.gz && \ - cd gen-louvain && \ - make && \ - cd ../ -ENV LOUVAIN_PATH=./external/gen-louvain - -# Install Leiden through Network analysis repo -RUN git clone https://github.com/vtraag/networkanalysis.git && \ - cd ./networkanalysis && \ - ./gradlew build && \ - cd ../ -ENV LEIDEN_PATH=$(pwd)/networkanalysis/build/libs/networkanalysis-1.2.0.jar + apt install -y --no-install-recommends git make g++ curl default-jre default-jdk zlib1g-dev unzip -## Install dependencies +# ## Install dependencies USER mambauser RUN micromamba install -y -n base --file metator.yaml && \ - micromamba install -y -n base pip && \ micromamba clean --all --yes # Install metator -RUN micromamba run -n base python3 -m pip install -e . 
+RUN micromamba run -n base pip install -e .[dev] WORKDIR /home/mambauser/ ENTRYPOINT [ "/bin/bash" ] diff --git a/MANIFEST.in b/MANIFEST.in index e1a0fb3..8e8b5e0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include requirements.txt include README.md +include external/artifacts/* recursive-exclude test_data * recursive-exclude tests * \ No newline at end of file diff --git a/README.md b/README.md index ecc6215..6e0be72 100644 --- a/README.md +++ b/README.md @@ -15,82 +15,79 @@ Metagenomic Tridimensional Organisation-based Reassembly - A set of scripts that

-## Table of contents - -1. [MetaTOR](#metator) - 1. [Table of contents](#table-of-contents) - 2. [Installation](#installation) - 1. [Requirements](#requirements) - 2. [Using pip](#using-pip) - 3. [Using conda](#using-conda) - 4. [Louvain or Leiden dependency](#louvain-or-leiden-dependency) - 5. [Using docker container](#using-docker-container) - 3. [Usage](#usage) - 4. [Output files](#output-files) - 5. [References](#references) - 6. [Contact](#contact) - 1. [Authors](#authors) - 2. [Research lab](#research-lab) +1. [Installation](#installation) + 1. [Using micromamba](#using-micromamba) + 2. [Using pip](#using-pip) +2. [Usage](#usage) +3. [Output files](#output-files) +4. [References](#references) +5. [Contact](#contact) + 1. [Authors](#authors) + 2. [Research lab](#research-lab) ## Installation -### Requirements +### Using micromamba -* Python `3.8` to `3.10` or later is required. -* The following librairies are required but will be automatically installed with the pip installation: `numpy`, `scipy`, `sklearn`, `pandas`, `docopt`, `networkx` `biopython` `pyfastx`, `pysam`, `micomplete` and `pairix`. -* The following software should be installed separately if you used the pip installation: - * [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) - * [samtools](http://www.htslib.org/) - * [louvain](https://sourceforge.net/projects/louvain/) (original - implementation). - * [networkanalysis](https://github.com/vtraag/networkanalysis) (not - necessary only if you want to use Leiden algorithm to partition the network) - -### Using pip - -```sh -pip3 install metator -``` - -or, to use the latest version: +`conda` is the recommended way to install the latest `metator` release: ```sh -pip3 install -e git+https://github.com/koszullab/metator.git@master#egg=metator +conda create -n metator bioconda::metator ``` -### Using conda - -```sh -conda create -n metator -y --log-level warning -f metator.yaml -``` +### Using pip -### Louvain or Leiden dependency +**Note:** while `metator` is available from PyPI, several additional libraries are +not available from PyPI and must be installed separately. Please consider the following +before installing `metator`: -In order to use Louvain or Leiden it's necessary to set a global variable `LOUVAIN_PATH` and `LEIDEN_PATH` depending on which algorithm you wan to use with the absolute path where the executable are. +* Python `3.9` to `3.11` is required. +* The following dependencies should also be locally installed and available in the `$PATH`: + * [`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) or `bwa` + * [`pairix`](https://github.com/4dn-dcic/pairix) + * [`samtools`](https://www.htslib.org/download/) + * [`hmmer`](http://hmmer.org/documentation.html) + * [`prodigal`](https://github.com/hyattpd/Prodigal) + * [`java`](https://www.oracle.com/java/technologies/downloads/) -For Louvain algorithm in the directory where you have the archive file (available in the external directory of this repository): +* The following non-pythonic libraries are **embedded** when installing `metator` with `pip`: [`louvain 0.3`](https://sourceforge.net/projects/louvain/files/GenericLouvain/) and [`leiden 1.3.0`](https://github.com/CWTSLeiden/networkanalysis). 
```sh -YOUR_DIRECTORY=$(pwd) -tar -xvzf louvain-generic.tar.gz -cd gen-louvain +# Install bowtie2, samtools, hmmer, prodigal and java-jdk: +sudo apt update && sudo apt install bowtie2 samtools hmmer prodigal default-jdk + +# Also install pairix: +wget https://github.com/4dn-dcic/pairix/archive/refs/tags/0.3.9.zip -O pairix-0.3.9.zip +unzip pairix-0.3.9.zip +mv pairix-0.3.9 ~/.local/lib/pairix +cd ~/.local/lib/pairix make -export LOUVAIN_PATH=$YOUR_DIRECTORY/gen-louvain/ +chmod +x bin/pairix +echo 'export PATH=$PATH:~/.local/lib/pairix/bin' >> ~/.bashrc + +# Install metator from PyPI +pip3 install metator ``` -For Leiden algorithm, clone the networkanalysis repository from github and build the Java script. Then you can export the Leiden path: +To use the development version: ```sh -export LEIDEN_PATH=/networkanalysis_repository_path/build/libs/networkanalysis-1.2.0.jar +# Install bowtie2, samtools, hmmer, prodigal, java-jdk and pairix, see above + +git clone https://github.com/koszullab/metator +cd metator +pip3 install -e .[dev] ``` + ## Usage @@ -211,6 +208,7 @@ This is the summary of the data of the final bins build with all the step of met ### Authors * amaury.bignaud@pasteur.fr +* jacques.serizay@pasteur.fr * lyam.baudry@pasteur.fr * thfoutel@pasteur.fr * martial.marbouty@pasteur.fr diff --git a/external/bowtie2-2.5.1-source.zip b/external/bowtie2-2.5.1-source.zip new file mode 100644 index 0000000..e4cbb04 Binary files /dev/null and b/external/bowtie2-2.5.1-source.zip differ diff --git a/external/networkanalysis-1.3.0.jar b/external/networkanalysis-1.3.0.jar new file mode 100644 index 0000000..01f925d Binary files /dev/null and b/external/networkanalysis-1.3.0.jar differ diff --git a/external/pairix-0.3.9.zip b/external/pairix-0.3.9.zip new file mode 100644 index 0000000..2e4658b Binary files /dev/null and b/external/pairix-0.3.9.zip differ diff --git a/external/setup_dependencies.sh b/external/setup_dependencies.sh new file mode 100644 index 0000000..525cac9 --- /dev/null +++ b/external/setup_dependencies.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +## Purge existing artifacts (required for local rebuild) +rm -rf artifacts/ gen-louvain/ pairix/ bowtie2/ networkanalysis/ + +## Install louvain +tar -k -xzf louvain-generic.tar.gz +cd gen-louvain +make +cd .. + +## Install leiden (https://github.com/CWTSLeiden/networkanalysis) +# wget "https://github-registry-files.githubusercontent.com/153760626/0f40f180-3ed3-11ee-916e-23eb9928c186?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250218%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250218T134441Z&X-Amz-Expires=300&X-Amz-Signature=065d9dec2e31375461b33db29c5e6261b54eca2ec2e44c6460a3138586f86b89&X-Amz-SignedHeaders=host&response-content-disposition=filename%3Dnetworkanalysis-1.3.0.jar&response-content-type=application%2Foctet-stream" -O networkanalysis-1.3.0.jar +mkdir -p networkanalysis/build/libs/ +cp networkanalysis-1.3.0.jar networkanalysis/build/libs/ + +## Install pairix +# wget https://github.com/4dn-dcic/pairix/archive/refs/tags/0.3.9.zip -O pairix-0.3.9.zip +# zip -d pairix-0.3.9.zip "pairix-0.3.9/samples/*" +# unzip pairix-0.3.9.zip +# mv pairix-0.3.9 pairix +# cd pairix +# make +# chmod +x bin/pairix +# cd .. 
+ +# ## Install bowtie2 +# wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.5.1/bowtie2-2.5.1-source.zip/download -O bowtie2-2.5.1-source.zip +# zip -d bowtie2-2.5.1-source.zip "bowtie2-2.5.1/example/*" +# unzip bowtie2-2.5.1-source.zip +# mv bowtie2-2.5.1/ bowtie2 +# cd bowtie2 +# make +# cd .. + +## Move artifacts to the correct location +mkdir -p artifacts/networkanalysis/build artifacts/pairix artifacts/bowtie2/bin +mv gen-louvain/ artifacts/ +mv networkanalysis/build artifacts/networkanalysis/ +# mv pairix/* artifacts/pairix/ +# mv bowtie2/bowtie2* artifacts/bowtie2/bin/ + +rm -rf gen-louvain/ pairix/ bowtie2/ networkanalysis/ diff --git a/metator.yaml b/metator.yaml index d8e10e9..396f92e 100644 --- a/metator.yaml +++ b/metator.yaml @@ -4,27 +4,28 @@ channels: - bioconda - defaults dependencies: - - requests - - pip - - git - - biopython=1.80 - - bowtie2=2.5.1 - - bwa=0.7.17 - - samtools=1.17 - - prodigal=2.6.3 - - hmmer=3.3.2 - - pysam=0.21.0 - - pairix=0.3.7 - - pairtools=1.0.2 - - pyfastx==0.8.4 - - cooler==0.9.1 - - pandas==1.5.3 + - python<3.12 - hicstuff + - networkx + - checkv + - biopython<=1.80 + - pysam + - pairtools + - pyfastx + - cooler + - numpy + - pandas + - scikit-learn + - scipy + - seaborn + - bowtie2 + - pairix + - bwa + - samtools + - prodigal + - hmmer + - gcc + - java-jdk - pip: - - micomplete==1.1.1 + - micomplete - metator - - pytest - - pylint - - codecov - - pytest-cov - - pytest-pylint diff --git a/metator/__init__.py b/metator/__init__.py deleted file mode 100644 index d3bd966..0000000 --- a/metator/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -from .version import __version__ as version -from . import * - -__author__ = "Amaury Bignaud, Jacques Serizay, Lyam Baudry, Théo Foutel-Rodier,\ - Martial Marbouty" -__copyright__ = "Copyright © 2017-2018, Institut Pasteur, Paris, France" -__credits__ = [ - "Amaury Bignaud", - "Jacques Serizay", - "Lyam Baudry", - "Théo Foutel-Rodier", - "Martial Marbouty", - "Axel Cournac", - "Vittore Scolari", - "Romain Koszul", -] -__license__ = "GPLv3" -__maintainer__ = "Amaury Bignaud" -__email__ = "amaury.bignaud@pasteur.fr" -__status__ = "Alpha" -__version__ = version diff --git a/metator/version.py b/metator/version.py deleted file mode 100644 index 07f744c..0000000 --- a/metator/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '1.3.3' diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0bc2cde --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,177 @@ +[build-system] +requires = ["hatchling", "hatch-build-scripts"] +build-backend = "hatchling.build" + +[project] +name = "metator" +version = "1.3.4" +description = "A pipeline for binning metagenomic datasets from metaHiC data." 
+readme = "README.md" +requires-python = ">=3.9,<3.12" +license = { text = "GNU General Public License v3 (GPLv3)" } + +authors = [ + {name = "Amaury Bignaud", email = "amaury.bignaud@pasteur.fr"}, + {name = "Jacques Serizay", email = "jacques.serizay@pasteur.fr"}, +] +keywords = [ + "metagenomics", + "bioinformatics" +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Scientific/Engineering :: Visualization", + "Operating System :: Unix", +] +dependencies = [ + "hicstuff", + "networkx", + "checkv", + "biopython<=1.80", + "pysam", + "pairtools", + "pyfastx", + "cooler", + "numpy", + "pandas", + "scikit-learn", + "scipy", + "seaborn", + "micomplete" + + # NON PIP DEPENDENCIES + + #"bowtie2" + #"pairix" + #"bwa" + #"samtools" + #"prodigal" + #"hmmer" + #"gcc" + #"java-jdk" +] + +[project.optional-dependencies] +test = [ + "coverage[toml]", + "isort", + "pytest", + "pytest-cov", + "pytest-pylint", + "pytest-order", + "codecov", + "pylint", + "ruff", +] +docs = [ + 'autodoc', + 'sphinx-autoapi', + "autodocsumm", + "sphinxcontrib-napoleon", + "m2r", + "recommonmark", + "Sphinx>=1.6", + "sphinx-autobuild", + "sphinx-click", + "furo", + "pydata-sphinx-theme", + "sphinx-rtd-theme", + "myst-parser", + "nbsphinx", + "lxml[html_clean]", +] +dev = [ + "metator[test,docs]", + "hatch", + "hatch-build-scripts", + "pre-commit", + "mypy", + "black", + "twine", + "ipython" +] + +[project.urls] +homepage = "https://github.com/koszullab/metator" +documentation = "https://github.com/koszullab/metator" +repository = "https://github.com/koszullab/metator" +changelog = "https://github.com/koszullab/metator/blob/devel/CHANGELOG.md" + +[project.scripts] +metator = "metator.main:main" + +[tool.hatch.version] +path = "src/metator/version.py" + +[tool.hatch.metadata] +allow-direct-references = true + +[[tool.hatch.build.hooks.build-scripts.scripts]] +out_dir = "external/artifacts/" +work_dir = "external" +commands = ["bash setup_dependencies.sh"] +artifacts = [] + +[tool.hatch.build.force-include] +"external/artifacts" = "metator/external/artifacts" + +[tool.black] +line-length = 130 +target-version = ['py310'] + +[tool.ruff] +line-length = 130 +src = ["src"] +exclude = [ + "bench/*", + ".venv", + "__main__.py", +] +lint.ignore = ["E402", "I001"] +lint.extend-select = [ + "B", # bugbear + "E", # style errors + "F", # pyflakes + "I", # isort + "RUF", # ruff-specific rules + # "UP", # pyupgrade + "W", # style warnings +] +lint.isort.known-first-party = ["metator"] + +[tool.pytest.ini_options] +markers = ["order: mark test to run in a specific order"] +minversion = "7" +log_cli_level = "INFO" +xfail_strict = true +addopts = [ + "-ra", + "--strict-config", + "--strict-markers", + "--cov=metator", + "--cov-config=pyproject.toml", + "--cov-report=term-missing", + "--cov-report=html", + "--cov-report=xml", +] +filterwarnings = ["ignore::PendingDeprecationWarning"] +testpaths = ["tests"] + +[tool.coverage.run] +source = ["src/metator"] + +omit = [] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "return NotImplemented", + "raise NotImplementedError" +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e85dae6..0000000 --- 
a/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -biopython==1.80 -cdlib -checkv -docopt -hicstuff -looseversion -micomplete==1.1.1 -networkx -numpy -pairix -pairtools -pandas -pyfastx -pypairix -pysam -requests -scikit-learn -scipy -seaborn diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 526aeb2..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal = 0 diff --git a/setup.py b/setup.py deleted file mode 100644 index ec38d92..0000000 --- a/setup.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -"""A pipeline for binning metagenomic datasets from metaHiC data. -""" - -from setuptools import setup, find_packages -import codecs - -CLASSIFIERS = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Science/Research", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Bio-Informatics", - "Topic :: Scientific/Engineering :: Visualization", - "Operating System :: Unix", -] - -name = "metator" - -MAJOR = 1 -MINOR = 3 -MAINTENANCE = 3 -VERSION = "{}.{}.{}".format(MAJOR, MINOR, MAINTENANCE) - -LICENSE = "GPLv3" -URL = "https://github.com/koszullab/metator" - -DESCRIPTION = __doc__.strip("\n") - -with codecs.open("README.md", encoding="utf-8") as f: - LONG_DESCRIPTION = f.read() - -with open("requirements.txt", "r") as f: - REQUIREMENTS = f.read().splitlines() - -with open("metator/version.py", "w") as f: - f.write("__version__ = '{}'\n".format(VERSION)) - - -setup( - name=name, - author="amaury.bignaud@pasteur.fr", - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - version=VERSION, - license=LICENSE, - classifiers=CLASSIFIERS, - url=URL, - packages=find_packages(), - python_requires=">=3.8,<=3.10", - include_package_data=True, - long_description_content_type="text/markdown", - install_requires=REQUIREMENTS, - entry_points={"console_scripts": ["metator=metator.main:main"]}, -) diff --git a/src/metator/__init__.py b/src/metator/__init__.py new file mode 100644 index 0000000..3952d15 --- /dev/null +++ b/src/metator/__init__.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +import importlib.util +from pathlib import Path +import os +import site +from .version import __version__ as version +from . 
import * + +__author__ = "Amaury Bignaud, Jacques Serizay, Lyam Baudry, Théo Foutel-Rodier,\ + Martial Marbouty" +__copyright__ = "Copyright © 2017-2018, Institut Pasteur, Paris, France" +__credits__ = [ + "Amaury Bignaud", + "Jacques Serizay", + "Lyam Baudry", + "Théo Foutel-Rodier", + "Martial Marbouty", + "Axel Cournac", + "Vittore Scolari", + "Romain Koszul", +] +__license__ = "GPLv3" +__maintainer__ = "Amaury Bignaud" +__email__ = "amaury.bignaud@pasteur.fr" +__status__ = "Alpha" +__version__ = version + + +def is_editable_install(): + """Check if the metator package was installed in editable mode.""" + site_packages = site.getsitepackages() + for site_package in site_packages: + pth_file = os.path.join(site_package, "_metator.pth") + if os.path.isfile(pth_file): + return True + return False + + +__metator_source__ = os.path.dirname(importlib.util.find_spec("metator").origin) # type: ignore +__metator_root__ = __metator_source__ +if is_editable_install(): + __metator_root__ = os.path.abspath(os.path.join(__metator_source__, "../../")) +__leiden_dir__ = Path(__metator_root__, "external", "artifacts", "networkanalysis", "build", "libs") +LEIDEN_PATH = str(next(__leiden_dir__.glob("networkanalysis-1.3.0*.jar"))) +LOUVAIN_PATH = str(Path(__metator_root__, "external", "artifacts", "gen-louvain")) diff --git a/metator/align.py b/src/metator/align.py similarity index 80% rename from metator/align.py rename to src/metator/align.py index 8e28a6d..ad046f6 100644 --- a/metator/align.py +++ b/src/metator/align.py @@ -88,12 +88,7 @@ def align( read_len=20, ) st.rmtree(iter_tmp_dir) - sp.call( - "samtools sort -n -@ {n_cpu} -o {bam} {tmp}".format( - n_cpu=n_cpu, tmp=tmp_bam, bam=bam_out - ), - shell=True, - ) + pysam.sort("-n", "-@", str(n_cpu), "-o", bam_out, tmp_bam) else: # Align the reads on the reference genome with the chosen aligner. @@ -105,25 +100,22 @@ def align( "fq_rev": fq_in_2, } if aligner == "bwa": - cmd = "bwa mem -5SP -t {cpus} {idx} {fq} {fq_rev}".format( - **map_args - ) + cmd = "bwa mem -5SP -t {cpus} {idx} {fq} {fq_rev}".format(**map_args) elif aligner == "bowtie2": - cmd = ( - "bowtie2 -x {idx} -p {cpus} --very-sensitive-local {fq} --no-unal" - ).format(**map_args) - - # Write the outputfile in a temporary bam file. 
- map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE) - sort_process = sp.Popen( - "samtools sort -n -@ {cpus} -o {bam}".format(**map_args), - shell=True, - stdin=map_process.stdout, - ) - _out, _err = sort_process.communicate() - mapping_values = map_process.stderr.read() - for line in mapping_values.split(b"\n"): - logger.info(f"{line.decode('utf-8')}") + cmd = ("bowtie2 -x {idx} -p {cpus} --very-sensitive-local {fq} --no-unal").format(**map_args) + + iter_tmp_dir = hio.generate_temp_dir(tmp_dir) + tmp_bam = join(iter_tmp_dir, "tmp.bam") + with open(tmp_bam, "wb") as tmp: + map_process = sp.Popen(cmd, shell=True, stdout=tmp, stderr=sp.PIPE) + mapping_values = map_process.stderr.read() + # Log the map_process stderr (mapping stats) + for line in mapping_values.split(b"\n"): + logger.info(f"{line.decode('utf-8')}") + + # Sort the bam file + pysam.sort("-n", "-@", str(n_cpu), "-o", bam_out, tmp_bam) + return 0 @@ -207,9 +199,7 @@ def get_contact_pairs( # Create the contig data dictionnary and hit from each alignments nb_alignment = len(for_list) - contig_data, hit_data = mtn.create_contig_data( - assembly, nb_alignment, depth_file, enzyme - ) + contig_data, hit_data = mtn.create_contig_data(assembly, nb_alignment, depth_file, enzyme) for i in range(len(for_list)): for_in = for_list[i] @@ -251,7 +241,7 @@ def get_contact_pairs( # Align the forward reads logger.info(f"Alignment of {for_in}:") - align(for_in, index, aligner, alignment_for, n_cpu, iterative) + align(for_in, index, aligner, alignment_for, n_cpu, tmp_dir, iterative) # Align the reverse reads logger.info(f"Alignment of {rev_in}:") @@ -297,39 +287,24 @@ def get_contact_pairs( # Filters the aligned and non aligned reads from the forward and # reverse bam files. - aligned_reads_for = process_bamfile( - alignment_for, min_qual, alignment_temp_for - ) - aligned_reads_rev = process_bamfile( - alignment_rev, min_qual, alignment_temp_rev - ) - logger.info( - f"{aligned_reads_for} forward reads aligned and {aligned_reads_rev} reverse reads aligned." - ) + aligned_reads_for = process_bamfile(alignment_for, min_qual, alignment_temp_for) + aligned_reads_rev = process_bamfile(alignment_rev, min_qual, alignment_temp_rev) + logger.info(f"{aligned_reads_for} forward reads aligned and {aligned_reads_rev} reverse reads aligned.") # Merge alignement to create a pairs file logger.info("Merging the pairs:") - n_pairs = merge_alignment( - alignment_temp_for, alignment_temp_rev, contig_data, out_file - ) + n_pairs = merge_alignment(alignment_temp_for, alignment_temp_rev, contig_data, out_file) # Case where a bam file from bwa is given as input. if aligner == "bwa": - n_pairs = process_bwa_bamfile( - alignment, min_qual, contig_data, out_file - ) + n_pairs = process_bwa_bamfile(alignment, min_qual, contig_data, out_file) logger.info(f"{n_pairs} pairs aligned.\n") total_aligned_pairs += n_pairs # Sort pairs. - logger.info(f"Sort and indexed {out_file}") - out_file = mio.sort_pairs_pairtools( - out_file, - threads=n_cpu, - remove=True, - force=True - ) + logger.info(f"Sort and index {out_file}") + out_file = mio.sort_pairs_pairtools(out_file, threads=n_cpu, remove=True, force=True) out_file_list.append(out_file) if len(out_file_list) > 1: @@ -371,9 +346,7 @@ def merge_alignment(forward_aligned, reverse_aligned, contig_data, out_file): """ # Open files for reading and writing. 
- with open(forward_aligned, "r") as for_file, open( - reverse_aligned, "r" - ) as rev_file, open(out_file, "w") as merged: + with open(forward_aligned, "r") as for_file, open(reverse_aligned, "r") as rev_file, open(out_file, "w") as merged: for_bam = csv.reader(for_file, delimiter="\t") rev_bam = csv.reader(rev_file, delimiter="\t") @@ -388,11 +361,7 @@ def merge_alignment(forward_aligned, reverse_aligned, contig_data, out_file): merged.write("#sorted: readID\n") merged.write("#shape: upper triangle\n") for contig in contig_data: - merged.write( - "#chromsize: {0} {1}\n".format( - contig, contig_data[contig]["length"] - ) - ) + merged.write("#chromsize: {0} {1}\n".format(contig, contig_data[contig]["length"])) # Loop while at least one end of one end is reached. It's possible to # advance like that as the two tsv files are sorted on the id of the @@ -403,40 +372,17 @@ def merge_alignment(forward_aligned, reverse_aligned, contig_data, out_file): # Write read ID merged.write(for_read[0] + "\t") # Pairs are 1-based so we have to add 1 to 0 based bam position - for_position = ( - for_read[1] + "\t" + str(int(for_read[2]) + 1) + "\t" - ) - rev_position = ( - rev_read[1] + "\t" + str(int(rev_read[2]) + 1) + "\t" - ) + for_position = for_read[1] + "\t" + str(int(for_read[2]) + 1) + "\t" + rev_position = rev_read[1] + "\t" + str(int(rev_read[2]) + 1) + "\t" # Have upper triangle shape - if ( - ( - for_read[1] == rev_read[1] - and int(for_read[2]) <= int(rev_read[2]) - ) - or contig_data[for_read[1]]["id"] - < contig_data[rev_read[1]]["id"] - ): - - merged.write( - for_position - + rev_position - + for_read[3] - + "\t" - + rev_read[3] - + "\n" - ) + if (for_read[1] == rev_read[1] and int(for_read[2]) <= int(rev_read[2])) or contig_data[for_read[1]][ + "id" + ] < contig_data[rev_read[1]]["id"]: + + merged.write(for_position + rev_position + for_read[3] + "\t" + rev_read[3] + "\n") else: - merged.write( - rev_position - + for_position - + rev_read[3] - + "\t" - + for_read[3] - + "\n" - ) + merged.write(rev_position + for_position + rev_read[3] + "\t" + for_read[3] + "\n") n_pairs += 1 try: for_read = next(for_bam) @@ -504,29 +450,11 @@ def process_bamfile(alignment, min_qual, filtered_out): # Check Mapping (0 or 16 flags are kept only) if r.flag == 0: aligned_reads += 1 - read = str( - r.query_name - + "\t" - + r.reference_name - + "\t" - + str(r.reference_start) - + "\t" - + "+" - + "\n" - ) + read = str(r.query_name + "\t" + r.reference_name + "\t" + str(r.reference_start) + "\t" + "+" + "\n") f.write(read) elif r.flag == 16: aligned_reads += 1 - read = str( - r.query_name - + "\t" - + r.reference_name - + "\t" - + str(r.reference_start) - + "\t" - + "-" - + "\n" - ) + read = str(r.query_name + "\t" + r.reference_name + "\t" + str(r.reference_start) + "\t" + "-" + "\n") f.write(read) temp_bam.close() @@ -575,11 +503,7 @@ def process_bwa_bamfile(alignment, min_qual, contig_data, out_file): merged.write("#sorted: readID\n") merged.write("#shape: upper triangle\n") for contig in contig_data: - merged.write( - "#chromsize: {0} {1}\n".format( - contig, contig_data[contig]["length"] - ) - ) + merged.write("#chromsize: {0} {1}\n".format(contig, contig_data[contig]["length"])) # Loop until the end of the file. Read the reads by two as the forward # and reverse reads should be interleaved. 
@@ -593,10 +517,7 @@ def process_bwa_bamfile(alignment, min_qual, contig_data, out_file): rev_read = next(temp_bam) # Check mapping quality - if ( - for_read.mapping_quality >= min_qual - and rev_read.mapping_quality >= min_qual - ): + if for_read.mapping_quality >= min_qual and rev_read.mapping_quality >= min_qual: # Check flag if not (for_read.is_unmapped or rev_read.is_unmapped): @@ -604,9 +525,7 @@ def process_bwa_bamfile(alignment, min_qual, contig_data, out_file): # Safety check (forward and reverse are the same reads) if for_read.query_name != rev_read.query_name: - logger.error( - f"Reads should be paired - {for_read.query_name}\t{rev_read.query_name}" - ) + logger.error(f"Reads should be paired - {for_read.query_name}\t{rev_read.query_name}") raise ValueError # Define pairs value. @@ -624,9 +543,7 @@ def process_bwa_bamfile(alignment, min_qual, contig_data, out_file): # Modify order to have an upper triangle and write # the pair. - if (contig1 == contig2 and pos1 <= pos2) or contig_data[ - contig1 - ]["id"] < contig_data[contig2]["id"]: + if (contig1 == contig2 and pos1 <= pos2) or contig_data[contig1]["id"] < contig_data[contig2]["id"]: merged.write( "\t".join( [ diff --git a/metator/commands.py b/src/metator/commands.py similarity index 100% rename from metator/commands.py rename to src/metator/commands.py diff --git a/metator/contact_map.py b/src/metator/contact_map.py similarity index 93% rename from metator/contact_map.py rename to src/metator/contact_map.py index c8be4fd..717fd9e 100644 --- a/metator/contact_map.py +++ b/src/metator/contact_map.py @@ -97,12 +97,8 @@ def set_contigs(self): except KeyError: logger.error("The object that you gave is not in the table.") raise ValueError - self.contigs = list( - contigs_data["Name"][contigs_data[self.object] == self.name] - ) - self.contigs_size = list( - contigs_data["Size"][contigs_data[self.object] == self.name] - ) + self.contigs = list(contigs_data["Name"][contigs_data[self.object] == self.name]) + self.contigs_size = list(contigs_data["Size"][contigs_data[self.object] == self.name]) def set_large_contigs(self): """Method to keep only contigs bigger than the threshold given to remove @@ -114,9 +110,7 @@ def set_large_contigs(self): if value >= self.min_size: self.large_contigs.append(self.contigs[index]) self.contigs = self.large_contigs - self.contigs_size = [ - size for size in self.contigs_size if size >= self.min_size - ] + self.contigs_size = [size for size in self.contigs_size if size >= self.min_size] def set_metator_object(self, metator_object, name): """Method to get the metator object and name of the object usable for @@ -176,9 +170,7 @@ def set_metator_object(self, metator_object, name): ) raise ValueError from object_no_exist if int(self.name) <= 0: - logger.error( - "A recursive bin should have an id bigger than 0." - ) + logger.error("A recursive bin should have an id bigger than 0.") elif metator_object == "final_bin": self.object = "Final_bin" self.name = name @@ -209,9 +201,7 @@ def write_fasta(self, tmp_dir, out_dir): for contig_name in self.contigs: file.write("%s\n" % contig_name) # Extract contigs from the fastq. 
- cmd = "pyfastx extract {0} -l {1} > {2}".format( - self.assembly, contigs_list, self.fasta - ) + cmd = "pyfastx extract {0} -l {1} > {2}".format(self.assembly, contigs_list, self.fasta) process = sp.Popen(cmd, shell=True) process.communicate() @@ -238,13 +228,12 @@ def extract_pairs(metator_data): with open(output_file, "w") as output_pairs: # Write the header of the output pairs output_pairs.write("## pairs format v1.0\n") - output_pairs.write( - "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n" - ) + output_pairs.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n") for contig_id, contig in enumerate(metator_data.contigs): output_pairs.write( "#chromsize: {0} {1}\n".format( - contig, metator_data.contigs_size[contig_id], + contig, + metator_data.contigs_size[contig_id], ) ) for pairs_file in metator_data.pairs_files: @@ -335,9 +324,7 @@ def generate_contact_map( # Extract bin information from metaTOR outdir. logger.info("Generate HiC contact map for %s", name) - metator_data = MetatorObject( - metator_object, name, assembly, contig_data_file, pairs, min_size - ) + metator_data = MetatorObject(metator_object, name, assembly, contig_data_file, pairs, min_size) metator_data.set_contigs() if min_size > 0: metator_data.set_large_contigs() diff --git a/metator/figures.py b/src/metator/figures.py similarity index 100% rename from metator/figures.py rename to src/metator/figures.py diff --git a/metator/host.py b/src/metator/host.py similarity index 99% rename from metator/host.py rename to src/metator/host.py index 7488302..7d1b06f 100644 --- a/metator/host.py +++ b/src/metator/host.py @@ -155,7 +155,7 @@ def getScoreList(self): def associate_bin( bin_contigs: dict, network: "networkx.classes.graph.Graph", - contig_data: "pandas.DataFrame", + contig_data: pd.DataFrame, threshold: float, ) -> dict: """Function to associate one bin to one MAG. diff --git a/metator/io.py b/src/metator/io.py similarity index 90% rename from metator/io.py rename to src/metator/io.py index bf7208c..ccfa45f 100644 --- a/metator/io.py +++ b/src/metator/io.py @@ -39,6 +39,7 @@ import pandas as pd import pathlib import pypairix +import pairtools import re import subprocess as sp import zipfile @@ -47,6 +48,7 @@ from metator.log import logger from os.path import join, exists, isfile from random import getrandbits +from packaging.version import Version def check_checkm(): @@ -61,9 +63,7 @@ def check_checkm(): try: checkm = sp.check_output("checkm", stderr=sp.STDOUT, shell=True) except sp.CalledProcessError: - logger.error( - "Cannot find 'checkm' in your path please install it or add it in your path." 
- ) + logger.error("Cannot find 'checkm' in your path please install it or add it in your path.") return False return True @@ -95,11 +95,7 @@ def check_fasta_index(ref, mode="bowtie2"): elif mode == "bwa": refdir = str(ref.parent) refdir_files = os.listdir(refdir) - bwa_idx_files = [ - join(refdir, f) - for f in refdir_files - if re.search(r".*\.(sa|pac|bwt|ann|amb)$", f) - ] + bwa_idx_files = [join(refdir, f) for f in refdir_files if re.search(r".*\.(sa|pac|bwt|ann|amb)$", f)] index = None if len(bwa_idx_files) < 5 else bwa_idx_files else: index = [ref] @@ -156,9 +152,7 @@ def check_louvain_cpp(louvain_path): # Check convert: try: - convert = sp.check_output( - f"{convert} --help", stderr=sp.STDOUT, shell=True - ) + convert = sp.check_output(f"{convert} --help", stderr=sp.STDOUT, shell=True) except sp.CalledProcessError: logger.error("Cannot find the 'convert' function from Louvain path.") return False @@ -166,9 +160,7 @@ def check_louvain_cpp(louvain_path): # Check louvain: try: - louvain = sp.check_output( - f"{louvain} --help", stderr=sp.STDOUT, shell=True - ) + louvain = sp.check_output(f"{louvain} --help", stderr=sp.STDOUT, shell=True) except sp.CalledProcessError: logger.error("Cannot find the 'louvain' function from Louvain path.") return False @@ -176,9 +168,7 @@ def check_louvain_cpp(louvain_path): # Check hierarchy: try: - hierarchy = sp.check_output( - f"{hierarchy} --help", stderr=sp.STDOUT, shell=True - ) + hierarchy = sp.check_output(f"{hierarchy} --help", stderr=sp.STDOUT, shell=True) except sp.CalledProcessError: logger.error("Cannot find the convert_net function from Louvain path.") return False @@ -197,11 +187,9 @@ def check_pairix(): True if pairix found in the path, False otherwise. """ try: - pairix = sp.check_output("pairix --help", stderr=sp.STDOUT, shell=True) + pairix = sp.check_output(f"pairix --help", stderr=sp.STDOUT, shell=True) except sp.CalledProcessError: - logger.error( - "Cannot find 'pairix' in your path please install it or add it in your path." - ) + logger.error("Cannot find 'pairix' in your path please install it or add it in your path.") raise ImportError return False return True @@ -217,11 +205,9 @@ def check_pairtools(): True if pairtools found in the path, False otherwise. """ try: - pairix = sp.check_output("pairtools", stderr=sp.STDOUT, shell=True) + pairtools = sp.check_output("pairtools", stderr=sp.STDOUT, shell=True) except sp.CalledProcessError: - logger.error( - "Cannot find 'pairtools' in your path please install it or add it in your path." - ) + logger.error("Cannot find 'pairtools' in your path please install it or add it in your path.") raise ImportError return False return True @@ -282,8 +268,7 @@ def generate_temp_dir(path): os.makedirs(full_path) except PermissionError: raise PermissionError( - "The temporary directory cannot be created in {}. " - "Make sure you have write permission.".format(path) + "The temporary directory cannot be created in {}. " "Make sure you have write permission.".format(path) ) return full_path @@ -417,9 +402,7 @@ def import_contig_data_mges(contig_data_file, binning_result, mges_list): contig_data.loc[i, "MGE"] = True mges_list_id.append(contig_data.index[i]) try: - contig_data.loc[i, "Final_bin"] = binning_result[ - contig_data.loc[i, "Name"] - ] + contig_data.loc[i, "Final_bin"] = binning_result[contig_data.loc[i, "Name"]] contig_data.loc[i, "Binned"] = True except KeyError: continue @@ -439,9 +422,7 @@ def import_network(network_file): networkx.classes.graph.Graph: Network as networkx class. 
""" - network = nx.read_edgelist( - network_file, nodetype=int, data=(("weight", float),) - ) + network = nx.read_edgelist(network_file, nodetype=int, data=(("weight", float),)) return network @@ -483,7 +464,10 @@ def micomplete_results_to_dict(micomplete_file): """ # Read table. micomplete_summary = pd.read_csv( - micomplete_file, sep="\t", comment="#", index_col=0, + micomplete_file, + sep="\t", + comment="#", + index_col=0, ).iloc[:, :13] # Transform to dictionnary. @@ -555,9 +539,7 @@ def file_type(filename): elif comp == "zip": zip_arch = zipfile.ZipFile(filename, "r") if len(zip_arch.namelist()) > 1: - raise IOError( - "Only a single fastq file must be in the zip archive." - ) + raise IOError("Only a single fastq file must be in the zip archive.") else: # ZipFile opens as bytes by default, using io to read as text zip_content = zip_arch.open(zip_arch.namelist()[0], "r") @@ -674,14 +656,10 @@ def retrieve_fasta(in_file, aligner, tmpdir): if check_is_fasta(in_file + ".fasta"): fasta = in_file + ".fasta" else: - logger.error( - "If you give bwa index, please make sure the fasta exists with the same prefix." - ) + logger.error("If you give bwa index, please make sure the fasta exists with the same prefix.") raise ValueError else: - logger.error( - "Please give as a reference a bowtie2 index or a fasta." - ) + logger.error("Please give as a reference a bowtie2 index or a fasta.") raise ValueError return fasta @@ -706,9 +684,7 @@ def save_sparse_matrix(s_mat, path): np.savetxt( path, sparse_arr, - header="{nrows}\t{ncols}\t{nonzero}".format( - nrows=s_mat.shape[0], ncols=s_mat.shape[1], nonzero=s_mat.nnz - ), + header="{nrows}\t{ncols}\t{nonzero}".format(nrows=s_mat.shape[0], ncols=s_mat.shape[1], nonzero=s_mat.nnz), comments="", fmt=["%i", "%i", "%1.3f"], delimiter="\t", @@ -744,30 +720,20 @@ def sort_pairs(in_file, out_file, tmp_dir=None, threads=1, buffer="2G"): # Check if UNIX sort version supports parallelism parallel_ok = True sort_ver = sp.Popen(["sort", "--version"], stdout=sp.PIPE) - sort_ver = ( - sort_ver.communicate()[0] - .decode() - .split("\n")[0] - .split(" ")[-1] - .split(".") - ) + sort_ver = sort_ver.communicate()[0].decode().split("\n")[0].split(" ")[-1].split(".") # If so, specify threads, otherwise don't mention it in the command line try: sort_ver = list(map(int, sort_ver)) if sort_ver[0] < 8 or (sort_ver[0] == 8 and sort_ver[1] < 23): logger.warning( "GNU sort version is {0} but >8.23 is required for parallel " - "sort. Sorting on a single thread.".format( - ".".join(map(str, sort_ver)) - ) + "sort. Sorting on a single thread.".format(".".join(map(str, sort_ver))) ) parallel_ok = False # BSD sort has a different format and will throw error upon parsing. It does # not support parallel processes anyway. except ValueError: - logger.warning( - "Using BSD sort instead of GNU sort, sorting on a single thread." - ) + logger.warning("Using BSD sort instead of GNU sort, sorting on a single thread.") parallel_ok = False # Sort pairs and append to file. @@ -816,9 +782,7 @@ def sort_pairs_pairtools(pairfile, threads=1, remove=False, force=False): os.remove(f"{basename}_sorted.pairs") else: force = "" - if os.path.isfile(f"{basename}_sorted.pairs") or os.path.isfile( - f"{basename}_sorted.pairs.gz" - ): + if os.path.isfile(f"{basename}_sorted.pairs") or os.path.isfile(f"{basename}_sorted.pairs.gz"): logger.error( f"The {basename}_sorted.pairs exists. Do not overwrite existing, use --force to overwrite or use another location." 
) @@ -826,6 +790,9 @@ def sort_pairs_pairtools(pairfile, threads=1, remove=False, force=False): # Sort pairs using pairtools. cmd = f"set -eu ; pairtools sort {pairfile} --nproc {threads} -o {basename}_sorted.pairs" + if Version(pairtools.__version__) >= Version("1.1.0"): + logger.debug("pairtools version >= 1.1.0. Use new options.") + cmd = cmd + " --c1 chr1 --c2 chr2 --p1 pos1 --p2 pos2 --pt strand1" process = sp.Popen(cmd, shell=True) _out, _err = process.communicate() # Compressed pairs. @@ -833,7 +800,7 @@ def sort_pairs_pairtools(pairfile, threads=1, remove=False, force=False): process = sp.Popen(cmd, shell=True) _out, _err = process.communicate() # Indexed pairs. - cmd = f"set -eu ; pairix{force} {basename}_sorted.pairs.gz" + cmd = f"set -eu ; pairix{force} {basename}_sorted.pairs.gz" process = sp.Popen(cmd, shell=True) _out, _err = process.communicate() @@ -858,13 +825,9 @@ def write_bin_summary(bin_summary, bin_summary_file): bin_summary = pd.DataFrame.from_dict(bin_summary, orient="index") # Change float format of the coverage. - bin_summary["HiC_abundance"] = bin_summary["HiC_abundance"].map( - lambda x: "%.4f" % x - ) + bin_summary["HiC_abundance"] = bin_summary["HiC_abundance"].map(lambda x: "%.4f" % x) try: - bin_summary["SG_abundance"] = bin_summary["SG_abundance"].map( - lambda x: "%.4f" % x - ) + bin_summary["SG_abundance"] = bin_summary["SG_abundance"].map(lambda x: "%.4f" % x) except KeyError: pass diff --git a/metator/log.py b/src/metator/log.py similarity index 100% rename from metator/log.py rename to src/metator/log.py diff --git a/metator/main.py b/src/metator/main.py similarity index 100% rename from metator/main.py rename to src/metator/main.py diff --git a/metator/mge.py b/src/metator/mge.py similarity index 98% rename from metator/mge.py rename to src/metator/mge.py index b30b031..c737988 100644 --- a/metator/mge.py +++ b/src/metator/mge.py @@ -26,6 +26,7 @@ import checkv import metator.figures as mtf import metator.io as mio +import networkx import numpy as np import pandas as pd import pypairix @@ -126,7 +127,7 @@ def build_matrix( def build_mge_depth( contigs_file: str, depth_file: str, - mges_data: "pandas.DataFrame", + mges_data: pd.DataFrame, mge_depth_file: str, ): """Build mge depth form the whole assembly depth file from metabat script. @@ -241,8 +242,8 @@ def generate_bin_summary( def generate_mge_bins_metabat( - mges_data: "pandas.DataFrame", -) -> "pandas.DataFrame": + mges_data: pd.DataFrame, +) -> pd.DataFrame: """Generates the binning of the mges contigs based on both HiC information (host detection) and the coverage and sequences information (metabat2 binning). @@ -293,10 +294,10 @@ def generate_mge_bins_metabat( def generate_mge_bins_pairs( - mges_data: "pandas.DataFrame", + mges_data: pd.DataFrame, pairs_files: List[str], threshold: float = 0.8, -) -> Tuple["pandas.DataFrame", dict]: +) -> Tuple[pd.DataFrame, dict]: """Generates the binning of the mges contigs based on both HiC information (host detection) and the coverage and sequences information (metabat2 binning). 
@@ -397,8 +398,8 @@ def mge_binning( checkv_db: str, depth_file: str, fasta_mges_contigs: str, - network: "networkx.classes.graph.Graph", - contigs_data: "pandas.DataFrame", + network: networkx.classes.graph.Graph, + contigs_data: pd.DataFrame, mges_list_id: List[int], out_dir: str, pairs_files: List[str], @@ -687,7 +688,7 @@ def run_metabat( outfile: str, mge_depth_file: str, temp_fasta: str, -) -> "pandas.DataFrame": +) -> pd.DataFrame: """Function to launch metabat binning which is based on sequence and coverage information. @@ -736,8 +737,8 @@ def run_metabat( def shuffle_mge_bins( - mges_data: "pandas.DataFrame", -) -> Tuple["pandas.DataFrame", dict]: + mges_data: pd.DataFrame, +) -> Tuple[pd.DataFrame, dict]: """Function to shuffle id to imitate a random binning with the same bins distribution as the one created by Metator MGE. @@ -776,8 +777,8 @@ def shuffle_mge_bins( def update_mge_data( - mges_data: "pandas.DataFrame", bins: List[Tuple] -) -> "pandas.DataFrame": + mges_data: pd.DataFrame, bins: List[Tuple] +) -> pd.DataFrame: """Function to update the mge bins data. Parameters diff --git a/metator/network.py b/src/metator/network.py similarity index 99% rename from metator/network.py rename to src/metator/network.py index c21e8c5..dbf0d55 100644 --- a/metator/network.py +++ b/src/metator/network.py @@ -297,7 +297,7 @@ def create_contig_data(assembly, nb_alignment=1, depth_file=None, enzyme=None): contig_data[contig.id] = { "id": global_id, "length": int(line[1]), - "GC": SeqUtils.GC(contig.seq), + "GC": SeqUtils.gc_fraction(contig.seq)*100, "hit": 0, "coverage": float(line[2]), "RS": (len(re.findall(pattern, str(contig.seq))) + 1) @@ -315,7 +315,7 @@ def create_contig_data(assembly, nb_alignment=1, depth_file=None, enzyme=None): contig_data[contig.id] = { "id": global_id, "length": len(contig.seq), - "GC": SeqUtils.GC(contig.seq), + "GC": SeqUtils.gc_fraction(contig.seq)*100, "hit": 0, "coverage": "-", "RS": (len(re.findall(pattern, str(contig.seq))) + 1) diff --git a/metator/partition.py b/src/metator/partition.py similarity index 98% rename from metator/partition.py rename to src/metator/partition.py index 230bcbf..e484fd1 100644 --- a/metator/partition.py +++ b/src/metator/partition.py @@ -39,6 +39,7 @@ from os.path import join from scipy import sparse from sklearn import metrics +from . import LEIDEN_PATH, LOUVAIN_PATH def algo_partition( @@ -83,14 +84,19 @@ def algo_partition( """ # Launch the write partition algorithm if algorithm == "leiden": - LEIDEN_PATH = os.environ["LEIDEN_PATH"] output_partition = leiden_iterations_java( - network_file, iterations, resolution_parameter, tmpdir, LEIDEN_PATH, + network_file, + iterations, + resolution_parameter, + tmpdir, + LEIDEN_PATH, ) elif algorithm == "louvain": - LOUVAIN_PATH = os.environ["LOUVAIN_PATH"] output_partition = louvain_iterations_cpp( - network_file, iterations, tmpdir, LOUVAIN_PATH, + network_file, + iterations, + tmpdir, + LOUVAIN_PATH, ) # elif algorithm == "spinglass": # output_partition = spinglass_partition( @@ -98,9 +104,7 @@ def algo_partition( # spins=spin, # ) else: - logger.error( - 'algorithm should be either "louvain", "leiden", or "spinglass"' - ) + logger.error('algorithm should be either "louvain", "leiden", or "spinglass"') raise ValueError return output_partition @@ -650,7 +654,6 @@ def partition( # Perform the iterations of Louvain or Leiden to partition the network. 
logger.info("Start iterations:") if algorithm == "leiden": - LEIDEN_PATH = os.environ["LEIDEN_PATH"] output_partition = leiden_iterations_java( network_file, iterations, @@ -659,9 +662,11 @@ def partition( LEIDEN_PATH, ) elif algorithm == "louvain": - LOUVAIN_PATH = os.environ["LOUVAIN_PATH"] output_partition = louvain_iterations_cpp( - network_file, iterations, temp_directory_clustering, LOUVAIN_PATH, + network_file, + iterations, + temp_directory_clustering, + LOUVAIN_PATH, ) else: logger.error('algorithm should be either "louvain" or "leiden"') diff --git a/metator/quality_check.py b/src/metator/quality_check.py similarity index 100% rename from metator/quality_check.py rename to src/metator/quality_check.py diff --git a/metator/regions.py b/src/metator/regions.py similarity index 100% rename from metator/regions.py rename to src/metator/regions.py diff --git a/metator/scaffold.py b/src/metator/scaffold.py similarity index 100% rename from metator/scaffold.py rename to src/metator/scaffold.py diff --git a/metator/validation.py b/src/metator/validation.py similarity index 100% rename from metator/validation.py rename to src/metator/validation.py diff --git a/src/metator/version.py b/src/metator/version.py new file mode 100644 index 0000000..9b52a14 --- /dev/null +++ b/src/metator/version.py @@ -0,0 +1,3 @@ +from importlib.metadata import version + +__version__ = version("metator") diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..7d6ab99 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,23 @@ +import importlib.util +from pathlib import Path +import os +import site + + +def is_editable_install(): + """Check if the metator package was installed in editable mode.""" + site_packages = site.getsitepackages() + for site_package in site_packages: + pth_file = os.path.join(site_package, "_metator.pth") + if os.path.isfile(pth_file): + return True + return False + + +__metator_source__ = os.path.dirname(importlib.util.find_spec("metator").origin) # type: ignore +__metator_root__ = __metator_source__ +if is_editable_install(): + __metator_root__ = os.path.abspath(os.path.join(__metator_source__, "../../")) +__leiden_dir__ = Path(__metator_root__, "external", "artifacts", "networkanalysis", "build", "libs") +LEIDEN_PATH = str(next(__leiden_dir__.glob("networkanalysis-1.3.0*.jar"))) +LOUVAIN_PATH = str(Path(__metator_root__, "external", "artifacts", "gen-louvain")) diff --git a/tests/test_io.py b/tests/test_io.py index 62f7d3f..0369425 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -5,22 +5,20 @@ import pytest import os import shutil +from . import LOUVAIN_PATH -def test_check_checkm(): - ... +def test_check_checkm(): ... -def test_check_fasta_index(): - ... +def test_check_fasta_index(): ... -def test_check_is_fasta(): - ... +def test_check_is_fasta(): ... def test_check_louvain_cpp(): - test = mio.check_louvain_cpp(os.environ['LOUVAIN_PATH']) + test = mio.check_louvain_cpp(LOUVAIN_PATH) assert test @@ -32,50 +30,39 @@ def test_check_pairix(): def test_check_pairtools(): test = mio.check_pairtools() assert test - -def test_generate_fasta_index(): - ... +def test_generate_fasta_index(): ... -def test_generate_temp_dir(): - ... +def test_generate_temp_dir(): ... -def test_get_restriction_site(): - ... +def test_get_restriction_site(): ... -def test_get_pairs(): - ... +def test_get_pairs(): ... -def test_process_ligation_sites(): - ... +def test_process_ligation_sites(): ... -def test_read_bin_summary(): - ... +def test_read_bin_summary(): ... 
-def test_read_compressed(): - ... +def test_read_compressed(): ... -def test_read_contig_data(): - ... +def test_read_contig_data(): ... -def test_read_results_checkm(): - ... +def test_read_results_checkm(): ... -def test_retrieve_fasta(): - ... +def test_retrieve_fasta(): ... -def test_sort_pairs(): - ... + +def test_sort_pairs(): ... def test_sort_pairs_pairtools(): @@ -86,8 +73,7 @@ def test_sort_pairs_pairtools(): shutil.copyfile(pairfile, test_file) mio.sort_pairs_pairtools(test_file, 1, True, False) pairs_data = pypairix.open(os.path.join(tmp_dir, f"test_sorted.pairs.gz")) - shutil.rmtree(tmp_dir) + shutil.rmtree(tmp_dir) -def test_write_bin_summary(): - ... +def test_write_bin_summary(): ... diff --git a/tests/test_network.py b/tests/test_network.py index 5fcf797..e63636b 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -128,7 +128,7 @@ def test_create_contig_data(): assert contig_data["NODE_522"] == { "id": 1, "length": 22786, - "GC": 62.077591503554814, + "GC": 62.07759150355482, "hit": 0, "coverage": "-", "RS": "-", @@ -142,7 +142,7 @@ def test_create_contig_data(): assert contig_data["NODE_522"] == { "id": 1, "length": 22786, - "GC": 62.077591503554814, + "GC": 62.07759150355482, "hit": 0, "coverage": 4.76595, "RS": 162, diff --git a/tests/test_partition.py b/tests/test_partition.py index a4d05cf..6f10dab 100644 --- a/tests/test_partition.py +++ b/tests/test_partition.py @@ -1,13 +1,14 @@ # Test for partition module +import os import metator.io as mio import metator.partition as mtp import networkx as nx import numpy as np -import os import pandas as pd import pytest import shutil +from . import LEIDEN_PATH, LOUVAIN_PATH assembly = "tests_data/assembly.fa" network_file = "tests_data/outdir/network.txt" @@ -15,20 +16,15 @@ resolution_parameter = 0.9 spins = 2 threads = 8 -LEIDEN_PATH = os.environ["LEIDEN_PATH"] -LOUVAIN_PATH = os.environ["LOUVAIN_PATH"] + overlapping_parameter = 0.6 tmp_dir = "tmp_partition_clustering" os.makedirs(tmp_dir, exist_ok=True) -partition = mtp.louvain_iterations_cpp( - network_file, iterations, tmp_dir, LOUVAIN_PATH -) +partition = mtp.louvain_iterations_cpp(network_file, iterations, tmp_dir, LOUVAIN_PATH) shutil.rmtree(tmp_dir) -contigs_data = pd.read_csv( - "tests_data/outdir/contig_data_partition.txt", sep="\t" -) +contigs_data = pd.read_csv("tests_data/outdir/contig_data_partition.txt", sep="\t") output_partition = { 1: "0;13;0;10;5", 2: "0;6;3;10;5", @@ -93,9 +89,7 @@ def test_algo_partition(): # Test algo partition choice. tmp_dir = "tmp_partition_partition" os.makedirs(tmp_dir, exist_ok=True) - network = nx.read_edgelist( - network_file, nodetype=int, data=(("weight", float),) - ) + network = nx.read_edgelist(network_file, nodetype=int, data=(("weight", float),)) subnetwork = network.subgraph(np.arange(1, 5)) for algorithm in ["louvain", "leiden", "error"]: try: @@ -135,9 +129,7 @@ def test_defined_overlapping_bins(): def test_detect_core_bins(): # Test core bin detection. - cc_contigs, cc_iterations = mtp.detect_core_bins( - output_partition, iterations - ) + cc_contigs, cc_iterations = mtp.detect_core_bins(output_partition, iterations) assert cc_contigs == core_bins_contigs assert (cc_iterations == core_bins_iterations).all().all() @@ -161,9 +153,7 @@ def test_generate_fasta(): def test_get_distances_splitmat(): # Test hamming distance computation worker. 
- x = mtp.get_distances_splitmat( - core_bins_iterations[0:1], core_bins_iterations - ) + x = mtp.get_distances_splitmat(core_bins_iterations[0:1], core_bins_iterations) assert np.sum(x.data) == pytest.approx(1.8, abs=1e-5) assert x.shape == (8, 1) assert x.nnz == 3 @@ -181,9 +171,7 @@ def test_leiden_iterations_java(): # Test leiden partition. tmp_dir = "tmp_partition_clustering" os.makedirs(tmp_dir, exist_ok=True) - partition = mtp.leiden_iterations_java( - network_file, iterations, resolution_parameter, tmp_dir, LEIDEN_PATH - ) + partition = mtp.leiden_iterations_java(network_file, iterations, resolution_parameter, tmp_dir, LEIDEN_PATH) _val = int(partition[1].split(";")[0]) assert len(partition) == 1058 assert len(partition[1].split(";")) == iterations @@ -197,8 +185,7 @@ def test_louvain_iterations_cpp(): assert len(partition[1].split(";")) == iterations -def test_partition(): - ... +def test_partition(): ... def test_remove_isolates(): @@ -208,9 +195,7 @@ def test_remove_isolates(): try: partition1[i] = partition[i] except KeyError: - partition1[i] = ";".join( - map(str, map(int, np.ones(iterations) * i)) - ) + partition1[i] = ";".join(map(str, map(int, np.ones(iterations) * i))) partition2 = mtp.remove_isolates(partition1, network_file) assert partition2 == partition diff --git a/tests/test_validation.py b/tests/test_validation.py index 9926d57..c292140 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -18,9 +18,7 @@ arch_output = "tests_data/outdir_validation/tmp_micomplete/arch131.tsv" bact_output = "tests_data/outdir_validation/tmp_micomplete/bact105.tsv" assembly = "tests_data/outdir_validation/assembly_val.fa" -bin_summary = mio.micomplete_results_to_dict( - "tests_data/outdir_validation/overlapping_micomplete_results.txt" -) +bin_summary = mio.micomplete_results_to_dict("tests_data/outdir_validation/overlapping_micomplete_results.txt") iterations = 5 resolution_parameter = 1.0 contigs_data = pd.read_csv( @@ -44,13 +42,10 @@ def test_get_bin_coverage(): bin_info = mtv.get_bin_coverage(bin_summary, contigs_data) - assert bin_info["MetaTOR_00002_00000"]["HiC_abundance"] == pytest.approx( - 49.72, abs=1e-2 - ) + assert bin_info["MetaTOR_00002_00000"]["HiC_abundance"] == pytest.approx(49.72, abs=1e-2) -def test_give_results_info(): - ... +def test_give_results_info(): ... def test_merge_micomplete(): @@ -62,8 +57,7 @@ def test_merge_micomplete(): os.remove(out_file) -def test_micomplete_compare_bins(): - ... +def test_micomplete_compare_bins(): ... def test_micomplete_quality(): @@ -75,8 +69,7 @@ def test_micomplete_quality(): os.remove(out_file) -def test_recursive_clustering(): - ... +def test_recursive_clustering(): ... def test_recursive_clustering_worker(): @@ -98,8 +91,7 @@ def test_recursive_clustering_worker(): shutil.rmtree(tmp_dir) -def test_recursive_decontamination(): - ... +def test_recursive_decontamination(): ... def test_update_contigs_data_recursive(): @@ -131,9 +123,7 @@ def test_update_contigs_data_recursive(): def test_write_bins_contigs(): binning_file = "tmp_binning.txt" - contig_data = mtv.write_bins_contigs( - bin_summary, contigs_data, binning_file, "MetaTOR" - ) + contig_data = mtv.write_bins_contigs(bin_summary, contigs_data, binning_file, "MetaTOR") print(np.unique(list(contig_data["Final_bin"]))) assert len(np.unique(list(contig_data["Final_bin"]))) == 2 os.remove(binning_file)
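A minimal sketch (not part of the patch) of the `Bio.SeqUtils` change applied in `src/metator/network.py` and noted in the changelog: `gc_fraction` (introduced in Biopython 1.80) returns a fraction in [0, 1] rather than the percentage returned by the deprecated `SeqUtils.GC`, hence the `* 100` scaling and the slightly different floating-point values in the updated test expectations. The sequence below is an arbitrary example.

```python
# Sketch of the GC -> gc_fraction migration used in create_contig_data().
from Bio.SeqUtils import gc_fraction

seq = "ATGCGCAT"  # arbitrary example sequence
gc_percent = gc_fraction(seq) * 100  # same percentage scale as the old SeqUtils.GC
print(f"GC content: {gc_percent:.2f}%")  # GC content: 50.00%
```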