diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 26f291b47..11e9a491f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.1 +current_version = 3.0.2 [comment] comment = The contents of this file cannot be merged with that of setup.cfg until https://github.com/c4urself/bump2version/issues/185 is resolved @@ -15,3 +15,7 @@ replace = version = "{new_version}" [bumpversion:file:CITATION.cff] search = version: "{current_version}" replace = version: "{new_version}" + +[bumpversion:file:env/requirements-docker.txt] +search = deeprank2=={current_version} +replace = deeprank2=={new_version} diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index 6815a04b6..9f7253360 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -26,70 +26,43 @@ runs: uses: styfle/cancel-workflow-action@0.4.0 with: access_token: ${{ github.token }} + - uses: actions/checkout@v3 - - name: Setup conda - uses: s-weigand/setup-conda@v1 + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 with: - update-conda: true + auto-update-conda: true + miniforge-variant: Mambaforge + channels: conda-forge python-version: ${{ inputs.python-version }} - conda-channels: pytorch, pyg, bioconda, defaults, sbl, conda-forge + activate-environment: deeprank2 + environment-file: env/deeprank2.yml + use-mamba: true + - run: | conda --version conda env list - shell: bash {0} + shell: bash -l {0} + - name: Python info - shell: bash -e {0} + shell: bash -l {0} run: | which python3 python3 --version - - name: Install dependencies on Linux - shell: bash {0} - env: - CMAKE_INSTALL_PREFIX: .local - if: runner.os == 'Linux' - run: | - # Install deeprank2 conda dependencies - ## DSSP - conda install -c sbl dssp>=4.2.2.1 - ## MSMS - conda install -c bioconda msms>=2.6.1 - ## PyTorch, PyG, PyG adds - ### Installing for CPU only on the CI - conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch - conda install pyg=2.4.0 -c pyg - pip install torch_scatter==2.1.2 torch_sparse==0.6.18 torch_cluster==1.6.3 torch_spline_conv==1.2.2 -f https://data.pyg.org/whl/torch-2.1.0+cpu.html - - name: Install dependencies on MacOS - shell: bash {0} - env: - CMAKE_INSTALL_PREFIX: .local - if: runner.os == 'macOS' - run: | - # Install dependencies not handled by setuptools - ## DSSP - conda install -c sbl dssp>=4.2.2.1 - ## MSMS - cd /tmp/ - wget http://mgltools.scripps.edu/downloads/tars/releases/MSMSRELEASE/REL2.6.1/msms_i86Linux2_2.6.1.tar.gz - sudo mkdir /usr/local/lib/msms - cd /usr/local/lib/msms - sudo tar zxvf /tmp/msms_i86Linux2_2.6.1.tar.gz - sudo ln -s /usr/local/lib/msms/msms.i86Linux2.2.6.1 /usr/local/bin/msms - sudo ln -s /usr/local/lib/msms/pdb_to_xyzr* /usr/local/bin - ## PyTorch, PyG, PyG adds - ### Installing for CPU only on the CI - conda install pytorch torchvision torchaudio cpuonly -c pytorch - pip install torch_geometric - pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-$(python3 -c "import torch; print(torch.__version__)")+cpu.html - # PyTables via conda only for MacOS - conda install pytables + - name: Install the GitHub repository version of the package - shell: bash {0} + shell: bash -l {0} if: ${{ inputs.pkg-installation-type == 'repository' }} - run: pip install .'[${{ inputs.extras-require }}]' - - name: Install the latest released version of the 
package - shell: bash {0} + run: | + conda activate deeprank2 + pip install .'[${{ inputs.extras-require }}]' + + - name: Install the latest released PyPI version of the package + shell: bash -l {0} if: ${{ inputs.pkg-installation-type == 'latest' }} run: | + conda activate deeprank2 pip install pytest rm -r deeprank2 pip install deeprank2 diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 13e2764a2..7af4cc7ae 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -39,14 +39,22 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] # ["3.10", "3.11"] + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} extras-require: test, publishing pkg-installation-type: "repository" + - name: Run unit tests run: pytest -v + - name: Verify that we can build the package run: python3 -m build diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index 8a7632985..eb4feff2a 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -39,6 +39,11 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + steps: - uses: actions/checkout@v3 - uses: ./.github/actions/install-python-and-package diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 15b26684f..4f72ac07f 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -39,6 +39,11 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + steps: - uses: actions/checkout@v3 - uses: ./.github/actions/install-python-and-package @@ -46,4 +51,6 @@ jobs: python-version: ${{ matrix.python-version }} extras-require: test - name: Check style against standards using ruff - run: ruff . 
+ run: | + ruff check + ruff format --check diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml new file mode 100644 index 000000000..b1e59ea9d --- /dev/null +++ b/.github/workflows/notebooks.yml @@ -0,0 +1,62 @@ +name: notebooks + +on: + push: + paths-ignore: + # specific folder locations + - ".vscode/**" + - "docs/**" + # filetypes + - "**.md" + - "**.rst" + - "**.cff" + - "**.png" + branches: + - main + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + paths-ignore: + # specific folder locations + - ".vscode/**" + - "docs/**" + # filetypes + - "**.md" + - "**.rst" + - "**.cff" + - "**.png" + +jobs: + build: + if: github.event.pull_request.draft == false + name: Build for (${{ matrix.python-version }}, ${{ matrix.os }}) + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + python-version: ["3.10"] # ["3.10", "3.11"] + + # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell + defaults: + run: + shell: bash -l {0} + + steps: + - uses: actions/checkout@v3 + + - uses: ./.github/actions/install-python-and-package + with: + python-version: ${{ matrix.python-version }} + extras-require: test, notebooks + pkg-installation-type: "repository" + + - name: Download the data for the tutorials + shell: bash -l {0} + run: | + wget https://zenodo.org/records/8349335/files/data_raw.zip + unzip data_raw.zip -d data_raw + mv data_raw tutorials + + - name: Run tutorial notebooks + run: pytest --nbmake tutorials diff --git a/.vscode/settings.json b/.vscode/settings.json index 6b68b7366..5cf7354a7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -17,6 +17,10 @@ }, "notebook.diff.ignoreMetadata": true, + // Pytest + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + // Format all files on save "editor.formatOnSave": true, "editor.defaultFormatter": "esbenp.prettier-vscode", diff --git a/CITATION.cff b/CITATION.cff index 0fc0a42aa..981a89d16 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,45 +1,5 @@ cff-version: "1.2.0" authors: -- family-names: Crocioni - given-names: Giulia - orcid: "https://orcid.org/0000-0002-0823-0121" -- family-names: Bodor - given-names: Dani L. - orcid: "https://orcid.org/0000-0003-2109-2349" -- family-names: Baakman - given-names: Coos - orcid: "https://orcid.org/0000-0003-4317-1566" -- family-names: Parizi - given-names: Farzaneh M. - orcid: "https://orcid.org/0000-0003-4230-7492" -- family-names: Rademaker - given-names: Daniel-T. - orcid: "https://orcid.org/0000-0003-1959-1317" -- family-names: Ramakrishnan - given-names: Gayatri - orcid: "https://orcid.org/0000-0001-8203-2783" -- family-names: Burg - given-names: Sven A. - name-particle: van der - orcid: "https://orcid.org/0000-0003-1250-6968" -- family-names: Marzella - given-names: Dario F. - orcid: "https://orcid.org/0000-0002-0043-3055" -- family-names: Teixeira - given-names: João M. C. - orcid: "https://orcid.org/0000-0002-9113-0622" -- family-names: Xue - given-names: Li C. - orcid: "https://orcid.org/0000-0002-2613-538X" -contact: -- family-names: Crocioni - given-names: Giulia - orcid: "https://orcid.org/0000-0002-0823-0121" -doi: 10.5281/zenodo.10566809 -message: If you use this software, please cite our article in the - Journal of Open Source Software. -preferred-citation: - authors: - family-names: Crocioni given-names: Giulia orcid: "https://orcid.org/0000-0002-0823-0121" @@ -71,6 +31,46 @@ preferred-citation: - family-names: Xue given-names: Li C. 
orcid: "https://orcid.org/0000-0002-2613-538X" +contact: + - family-names: Crocioni + given-names: Giulia + orcid: "https://orcid.org/0000-0002-0823-0121" +doi: 10.5281/zenodo.10566809 +message: If you use this software, please cite our article in the + Journal of Open Source Software. +preferred-citation: + authors: + - family-names: Crocioni + given-names: Giulia + orcid: "https://orcid.org/0000-0002-0823-0121" + - family-names: Bodor + given-names: Dani L. + orcid: "https://orcid.org/0000-0003-2109-2349" + - family-names: Baakman + given-names: Coos + orcid: "https://orcid.org/0000-0003-4317-1566" + - family-names: Parizi + given-names: Farzaneh M. + orcid: "https://orcid.org/0000-0003-4230-7492" + - family-names: Rademaker + given-names: Daniel-T. + orcid: "https://orcid.org/0000-0003-1959-1317" + - family-names: Ramakrishnan + given-names: Gayatri + orcid: "https://orcid.org/0000-0001-8203-2783" + - family-names: Burg + given-names: Sven A. + name-particle: van der + orcid: "https://orcid.org/0000-0003-1250-6968" + - family-names: Marzella + given-names: Dario F. + orcid: "https://orcid.org/0000-0002-0043-3055" + - family-names: Teixeira + given-names: João M. C. + orcid: "https://orcid.org/0000-0002-9113-0622" + - family-names: Xue + given-names: Li C. + orcid: "https://orcid.org/0000-0002-2613-538X" date-published: 2024-02-27 doi: 10.21105/joss.05983 issn: 2475-9066 @@ -86,3 +86,4 @@ preferred-citation: volume: 9 title: "DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning" +version: "3.0.2" diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7169f5a1a..901040a17 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -37,7 +37,7 @@ You want to make some kind of change to the code base #. if needed, fork the repository to your own Github profile and create your own feature branch off of the latest main commit. While working on your feature branch, make sure to stay up to date with the main branch by pulling in changes, possibly from the 'upstream' repository (follow the instructions `here `__ and `here `__); #. make sure the existing tests still work by running ``python setup.py test``; #. add your own tests (if necessary); -#. ensure the code is correctly linted (``ruff .``) and formatted (``ruff format .``); +#. ensure the code is correctly linted (``ruff check.``) and formatted (``ruff format .``); #. see our `developer's readme `_ for detailed information on our style conventions, etc.; #. update or expand the documentation; #. 
`push `_ your feature branch to (your fork of) the DeepRank2 repository on GitHub; diff --git a/Dockerfile b/Dockerfile index f4b52528a..73be0bb87 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,26 +3,26 @@ FROM --platform=linux/x86_64 condaforge/miniforge3:23.3.1-1 # Add files ADD ./tutorials /home/deeprank2/tutorials -ADD ./env/environment.yml /home/deeprank2 -ADD ./env/requirements.txt /home/deeprank2 +ADD ./env/deeprank2-docker.yml /home/deeprank2 +ADD ./env/requirements-docker.txt /home/deeprank2 -# Install RUN \ - apt update -y && - apt install unzip -y && + # Install dependencies and package + apt update -y && \ + apt install unzip -y && \ ## GCC - apt install -y gcc && - ## Conda and pip deps - mamba env create -f /home/deeprank2/environment.yml && - ## Get the data for running the tutorials - if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && - if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && - wget https://zenodo.org/records/8349335/files/data_raw.zip && - unzip data_raw.zip -d data_raw && + apt install -y gcc && \ + ## Create the environment and install the dependencies + mamba env create -f /home/deeprank2/deeprank2-docker.yml && \ + ## Activate the environment automatically when entering the container + echo "source activate deeprank2" >~/.bashrc && \ + # Get the data for running the tutorials + if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && \ + if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && \ + wget https://zenodo.org/records/8349335/files/data_raw.zip && \ + unzip data_raw.zip -d data_raw && \ mv data_raw /home/deeprank2/tutorials -# Activate the environment -RUN echo "source activate deeprank2" >~/.bashrc ENV PATH /opt/conda/envs/deeprank2/bin:$PATH # Define working directory diff --git a/README.dev.md b/README.dev.md index 879257dec..d15731b30 100644 --- a/README.dev.md +++ b/README.dev.md @@ -48,7 +48,7 @@ We use [ruff](https://docs.astral.sh/ruff/) for linting, sorting imports and for If you are using VS code, please install and activate the [Ruff extension](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) to automatically format and check linting. -Otherwise, please ensure check both linting (`ruff fix .`) and formatting (`ruff format .`) before requesting a review. +Otherwise, please check both linting (`ruff check .`) and formatting (`ruff format .`) before requesting a review. We use [prettier](https://prettier.io/) for formatting most other files. If you are editing or adding non-python files and using VS code, the [Prettier extension](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) can be installed to auto-format these files as well. 
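To make the lint and format check above concrete, here is a minimal sketch of the invocations (the commands themselves come from the text and from the CI linting workflow in this changeset; running them from the repository root inside the development environment is an assumption):

```bash
# Run from the repository root, inside the development environment
ruff check .           # lint; `ruff check --fix .` applies safe auto-fixes
ruff format .          # apply formatting in place
ruff format --check .  # only verify formatting, as the linting CI workflow does
```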
diff --git a/README.md b/README.md index 25acb670e..4482c0d70 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # DeepRank2 -| Badges | | -| :------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | -| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/b1bde03fc0334e07b0cd8a69ce2adeb3)](https://app.codacy.com/gh/DeepRank/deeprank2/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) | -| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![RSD](https://img.shields.io/badge/RSD-deeprank2-pink)](https://research-software-directory.org/software/deeprankcore) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) [![DOI](https://joss.theoj.org/papers/10.21105/joss.05983/status.svg)](https://doi.org/10.21105/joss.05983) | +| Badges | | +| :------------: | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | +| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/b1bde03fc0334e07b0cd8a69ce2adeb3)](https://app.codacy.com/gh/DeepRank/deeprank2/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) | +| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![RSD](https://img.shields.io/badge/RSD-deeprank2-pink)](https://research-software-directory.org/software/deeprankcore) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) 
[![DOI](https://joss.theoj.org/papers/10.21105/joss.05983/status.svg)](https://doi.org/10.21105/joss.05983) | | **tests** | [![Build Status](https://github.com/DeepRank/deeprank2/actions/workflows/build-repo.yml/badge.svg)](https://github.com/DeepRank/deeprank2/actions) ![Linting status](https://github.com/DeepRank/deeprank2/actions/workflows/linting.yml/badge.svg?branch=main) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank2/badge.svg?branch=main)](https://coveralls.io/github/DeepRank/deeprank2?branch=main) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) | -| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | -| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | +| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | +| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | ## Overview @@ -17,7 +17,12 @@ DeepRank2 is an open-source deep learning (DL) framework for data mining of protein-protein interfaces (PPIs) or single-residue variants (SRVs). This package is an improved and unified version of three previously developed packages: [DeepRank](https://github.com/DeepRank/deeprank), [DeepRank-GNN](https://github.com/DeepRank/Deeprank-GNN), and [DeepRank-Mut](https://github.com/DeepRank/DeepRank-Mut). -DeepRank2 allows for transformation of (pdb formatted) molecular data into 3D representations (either grids or graphs) containing structural and physico-chemical information, which can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either [CNNs](https://en.wikipedia.org/wiki/Convolutional_neural_network) (for grids) or [GNNs](https://en.wikipedia.org/wiki/Graph_neural_network) (for graphs), as well as output exporters for evaluating performances. +As input, DeepRank2 takes [PDB-formatted](https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html) atomic structures, and map them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. DeepRank2 has the option to choose between two types of queries as input for the featurization phase: + +- PPIs, for mining interaction patterns within protein-protein complexes, implemented by the `ProteinProteinInterfaceQuery` class; +- SRVs, for mining mutation phenotypes within protein structures, implemented by the `SingleResidueVariantQuery` class. + +The physico-chemical and geometrical features are then computed and assigned to each node and edge. The user can choose which features to generate from several pre-existing options defined in the package, or define custom features modules, as explained in the documentation. The graphs can then be mapped to 3D-grids as well. The generated data can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either [CNNs](https://en.wikipedia.org/wiki/Convolutional_neural_network) (for 3D-grids) or [GNNs](https://en.wikipedia.org/wiki/Graph_neural_network) (for graphs), as well as output exporters for evaluating performances. 
Main features: @@ -28,7 +33,7 @@ Main features: - binary class, CAPRI categories, DockQ, RMSD, and FNAT - Detailed docking scores documentation is available [here](https://deeprank2.readthedocs.io/en/latest/docking.html) - Flexible definition of both new features and targets -- Features generation for both graphs and grids +- Features generation for both graphs and 3D-grids - Efficient data storage in HDF5 format - Support for both classification and regression (based on [PyTorch](https://pytorch.org/) and [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/)) @@ -44,8 +49,8 @@ Main features: - [Installation](#installation) - [Containerized Installation](#containerized-installation) - [Local/remote installation](#localremote-installation) - - [YML file installation](#yml-file-installation) - - [Manual installation](#manual-installation) + - [YML file installation (recommended)](#yml-file-installation-recommended) + - [Manual installation (customizable)](#manual-installation-customizable) - [Testing DeepRank2 installation](#testing-deeprank2-installation) - [Contributing](#contributing) - [Using DeepRank2](#using-deeprank2) @@ -95,7 +100,7 @@ Local installation is formally only supported on the latest stable release of ub Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -#### YML file installation +#### YML file installation (recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -109,27 +114,28 @@ cd deeprank2 # Ensure you are in your base environment conda activate # Create the environment -conda env create -f env/environment.yml +conda env create -f env/deeprank2.yml # Activate the environment conda activate deeprank2 +# Install the latest deeprank2 release +pip install deeprank2 ``` See instructions below to [test](#testing-deeprank2-installation) that the installation was succesful. -#### Manual installation +#### Manual installation (customizable) + +If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not succesful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Jan 2024): +You can first remove from `env/deeprank2.yml` the packages that cannot be installed properly, or the ones that you want to install differently (e.g., pytorch-related packages if you wish to install the CUDA version), and then proceed with the environment creation by using the edited YML file: `conda env create -f env/deeprank2.yml`. 
Then activate the environment, and proceed with installing the missing packages, which might fall into the following list. If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Feb 2024): -- [DSSP 4](https://anaconda.org/sbl/dssp): `conda install -c sbl dssp` -- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms` - - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -- [PyTorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` - - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). +- [MSMS](https://anaconda.org/bioconda/msms): [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/) + - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested on ubuntu using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). - We support torch's CPU library as well as CUDA. - [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. - The exact command to install pyg will depend on the version of pytorch you are using. Please refer to the source's installation instructions (we recommend using the pip installation for this as it also shows the command for the dependencies). -- For MacOS with M1 chip users: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). +- [FreeSASA](https://freesasa.github.io/python/). Finally install deeprank2 itself: `pip install deeprank2`. @@ -145,7 +151,7 @@ The `test` extra is optional, and can be used to install test-related dependenci #### Testing DeepRank2 installation -You can check that all components were installed correctly, using pytest. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). +You can check that all components were installed correctly, using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). The quick test should be sufficient to ensure that the software works, while the full test (a few minutes) will cover a much broader range of settings to ensure everything is correct. @@ -166,10 +172,13 @@ For each protein-protein complex (or protein structure containing a missense var A `Query` takes as inputs: -- a `.pdb` file, representing the protein-protein structure, -- the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, -- the ids of the chains composing the structure, and -- optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. +- A `.pdb` file, representing the molecular structure. +- The resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom. +- `chain_ids`, the chain ID or IDs (generally single capital letter(s)). 
+ - `SingleResidueVariantQuery` takes a single ID, which represents the chain containing the variant residue. + - `ProteinProteinInterfaceQuery` takes a pair of ids, which represent the chains between which the interface exists. + - Note that in either case this does not limit the structure to residues from this/these chain/s. The structure contained in the `.pdb` can thus have any number of chains, and residues from these chains will be included in the graphs and 3D-grids produced by DeepRank2 (if they are within the `influence_radius`). +- Optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. ```python from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery @@ -218,7 +227,7 @@ queries.add(ProteinProteinInterfaceQuery( The user is free to implement a custom query class. Each implementation requires the `build` method to be present. -The queries can then be processed into graphs only or both graphs and 3D grids, depending on which kind of network will be used later for training. +The queries can then be processed into graphs only or both graphs and 3D-grids, depending on which kind of network will be used later for training. ```python from deeprank2.features import components, conservation, contact, exposure, irc, surfacearea @@ -231,7 +240,7 @@ hdf5_paths = queries.process( "/", feature_modules = feature_modules) -# Save data into 3D-graphs and 3D-grids +# Save data into graphs and 3D-grids hdf5_paths = queries.process( "/", feature_modules = feature_modules, diff --git a/deeprank2/__init__.py b/deeprank2/__init__.py index 055276878..131942e76 100644 --- a/deeprank2/__init__.py +++ b/deeprank2/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.0.2" diff --git a/deeprank2/trainer.py b/deeprank2/trainer.py index 56b5637ad..e133dbcaa 100644 --- a/deeprank2/trainer.py +++ b/deeprank2/trainer.py @@ -21,7 +21,7 @@ from deeprank2.utils.earlystopping import EarlyStopping from deeprank2.utils.exporters import HDF5OutputExporter, OutputExporter, OutputExporterCollection -# ruff: noqa: PYI041 (redundant-numeric-union), they are used differently in this module +# ruff: noqa: PYI041 (usage depends on type in this module) _log = logging.getLogger(__name__) diff --git a/deeprank2/utils/community_pooling.py b/deeprank2/utils/community_pooling.py index 04384268f..553ab4b4f 100644 --- a/deeprank2/utils/community_pooling.py +++ b/deeprank2/utils/community_pooling.py @@ -11,7 +11,7 @@ from torch_geometric.nn.pool.pool import pool_batch, pool_edge from torch_scatter import scatter_max, scatter_mean -# ruff: noqa: ANN001, ANN201 (missing type hints and return types) +# ruff: noqa: ANN001, ANN201 def plot_graph(graph, cluster) -> None: # noqa:D103 diff --git a/deeprank2/utils/parsing/__init__.py b/deeprank2/utils/parsing/__init__.py index 1d253eee0..f22f4e52a 100644 --- a/deeprank2/utils/parsing/__init__.py +++ b/deeprank2/utils/parsing/__init__.py @@ -65,7 +65,11 @@ def get_vanderwaals_parameters(self, atom: Atom) -> VanderwaalsParam: type_ = action["TYPE"] if type_ is None: - _log.warning(f"Atom {atom} is unknown to the forcefield; vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0)") + _log.warning( + f"Atom {atom} is unknown to the forcefield, vanderwaals_parameters set to (0.0, 0.0, 0.0, 0.0).\n" + " This will affect `vanderwaals` feature.\n" + " Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.", + ) return VanderwaalsParam(0.0, 0.0, 0.0, 0.0) return 
self._vanderwaals_parameters[type_] @@ -94,7 +98,11 @@ def get_charge(self, atom: Atom) -> float: charge = float(action["CHARGE"]) if charge is None: - _log.warning(f"Atom {atom} is unknown to the forcefield; charge is set to 0.0") + _log.warning( + f"Atom {atom} is unknown to the forcefield, `electrostatic` and `atom_charge` charge is set to 0.0.\n" + " This will affect `electrostatic` and `atom_charge` features.\n" + " Check https://deeprank2.readthedocs.io/en/latest/features.html#nonbond-energies for more details.", + ) return 0.0 return charge diff --git a/docs/features.md b/docs/features.md index 3ed576ccd..6fa59ea6b 100644 --- a/docs/features.md +++ b/docs/features.md @@ -168,9 +168,14 @@ These features relate to the structural relationship between nodes. #### Nonbond energies: -These features measure nonbond energy potentials between nodes. +These features measure nonbond energy potentials between nodes, and are calculated using the [OPLS forcefield](https://en.wikipedia.org/wiki/OPLS). For residue graphs, the pairwise sum of potentials for all atoms from each residue is used. Note that no distance cutoff is used and the radius of influence is assumed to be infinite, although the potentials tend to 0 at large distances. Note, however, that edges are only assigned within a given cutoff radius when graphs are created. + Nonbond energies are set to 0 for any atom pairs (on the same chain) that are within a cutoff radius of 3.6 Å, as these are assumed to be covalent neighbors or linked by no more than 2 covalent bonds (i.e. 1-3 pairs). - `electrostatic`: Electrostatic potential (also known as Coulomb potential) between two nodes, calculated using interatomic distances and charges of each atom (float). - `vanderwaals`: Van der Waals potential (also known as Lennard-Jones potential) between two nodes, calculated using interatomic distance/s and a list of atoms with vanderwaals parameters (`deeprank2.domain.forcefield.protein-allhdg5-4_new`, float). Atom pairs within a cutoff radius of 4.2 Å (but above 3.6 Å) are assumed to be separated by exactly 2 covalent bonds (i.e. 1-4 pairs) and use a set of lower energy parameters. + +Charge and vanderwaals parameters are set to 0 for those atoms that are unknown to the OPLS forcefield, treating such cases as missing values. If this happens for many of the atoms in the provided PDB file(s), depending on the specific dataset it may be worth dropping the affected features, i.e., `electrostatic`, `vanderwaals`, and `atom_charge`. + +- It may be useful to generate histograms of the processed data to further investigate the distribution of these features' values before deciding whether to drop them. Refer to the `data_generation_xxx.ipynb` [tutorial files](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for comprehensive instructions on transforming the data into a Pandas dataframe and generating histograms of the features. diff --git a/docs/getstarted.md b/docs/getstarted.md index 9a98462d7..1f2eb561c 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -10,10 +10,13 @@ For each protein-protein complex (or protein structure containing a missense var A `Query` takes as inputs: -- a `.pdb` file, representing the protein-protein structure, -- the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, -- the ids of the chains composing the structure, and -- optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. 
+- A `.pdb` file, representing the molecular structure. +- The resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom. +- `chain_ids`, the chain ID or IDs (generally single capital letter(s)). + - `SingleResidueVariantQuery` takes a single ID, which represents the chain containing the variant residue. + - `ProteinProteinInterfaceQuery` takes a pair of ids, which represent the chains between which the interface exists. + - Note that in either case this does not limit the structure to residues from this/these chain/s. The structure contained in the `.pdb` can thus have any number of chains, and residues from these chains will be included in the graphs and grids produced by DeepRank2 (if they are within the `influence_radius`). +- Optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files. ```python from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery diff --git a/docs/index.rst b/docs/index.rst index fe3d4084d..689aefe4f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,14 +4,19 @@ DeepRank2 |version| documentation DeepRank2 is an open-source deep learning (DL) framework for data mining of protein-protein interfaces (PPIs) or single-residue variants (SRVs). This package is an improved and unified version of three previously developed packages: `DeepRank`_, `DeepRank-GNN`_, and `DeepRank-Mut`_. -DeepRank2 allows for transformation of (pdb formatted) molecular data into 3D representations (either grids or graphs) containing structural and physico-chemical information, which can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either `convolutional neural networks`_ (for grids) or `graph neural networks`_ (for graphs), as well as output exporters for evaluating performances. +As input, DeepRank2 takes `PDB-formatted`_ atomic structures, and maps them to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. DeepRank2 has the option to choose between two types of queries as input for the featurization phase: + +- PPIs, for mining interaction patterns within protein-protein complexes, implemented by the `ProteinProteinInterfaceQuery` class; +- SRVs, for mining mutation phenotypes within protein structures, implemented by the `SingleResidueVariantQuery` class. + +The physico-chemical and geometrical features are then computed and assigned to each node and edge. The user can choose which features to generate from several pre-existing options defined in the package, or define custom features modules, as explained in the documentation. The graphs can then be mapped to 3D-grids as well. The generated data can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either `convolutional neural networks`_ (for 3D-grids) or `graph neural networks`_ (for graphs), as well as output exporters for evaluating performances. Main features: * Predefined atom-level and residue-level feature types (e.g. 
atom/residue type, charge, size, potential energy, all features' documentation is available under `Features`_ notes) * Predefined target types (binary class, CAPRI categories, DockQ, RMSD, and FNAT, detailed docking scores documentation is available under `Docking scores`_ notes) * Flexible definition of both new features and targets -* Features generation for both graphs and grids +* Features generation for both graphs and 3D-grids * Efficient data storage in HDF5 format * Support both classification and regression (based on `PyTorch`_ and `PyTorch Geometric`_) @@ -24,6 +29,7 @@ Main features: .. _Docking scores: https://deeprank2.readthedocs.io/en/latest/docking.html .. _PyTorch: https://pytorch.org/docs/stable/index.html .. _PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/ +.. _PDB-formatted: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html Getting started =========== diff --git a/docs/installation.md b/docs/installation.md index af2443ea9..394a28d09 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -4,9 +4,9 @@ - [Installation](#installation) - [Containerized Installation](#containerized-installation) - [Local/remote installation](#localremote-installation) - - [YML file installation](#yml-file-installation) - - [Manual installation](#manual-installation) - - [Testing DeepRank2 installation](#testing-deeprank2-installation) + - [YML file installation (recommended)](#yml-file-installation-recommended) + - [Manual installation (customizable)](#manual-installation-customizable) + - [Testing DeepRank2 installation](#testing-deeprank2-installation) - [Contributing](#contributing) # Installation @@ -46,7 +46,7 @@ Local installation is formally only supported on the latest stable release of ub Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -#### YML file installation +#### YML file installation (recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -60,27 +60,28 @@ cd deeprank2 # Ensure you are in your base environment conda activate # Create the environment -conda env create -f env/environment.yml +conda env create -f env/deeprank2.yml # Activate the environment conda activate deeprank2 +# Install the latest deeprank2 release +pip install deeprank2 ``` See instructions below to [test](#testing-deeprank2-installation) that the installation was succesful. -### Manual installation +#### Manual installation (customizable) -If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not succesful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Jan 2024): +If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. 
We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -- [DSSP 4](https://anaconda.org/sbl/dssp): `conda install -c sbl dssp` -- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms` - - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -- [PyTorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` - - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). +You can first remove from `env/deeprank2.yml` the packages that cannot be installed properly, or the ones that you want to install differently (e.g., pytorch-related packages if you wish to install the CUDA version), and then proceed with the environment creation by using the edited YML file: `conda env create -f env/deeprank2.yml`. Then activate the environment, and proceed with installing the missing packages, which might fall into the following list. If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Feb 2024): + +- [MSMS](https://anaconda.org/bioconda/msms): [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/) + - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested on ubuntu using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). - We support torch's CPU library as well as CUDA. - [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. - The exact command to install pyg will depend on the version of pytorch you are using. Please refer to the source's installation instructions (we recommend using the pip installation for this as it also shows the command for the dependencies). -- For MacOS with M1 chip users: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). +- [FreeSASA](https://freesasa.github.io/python/). Finally install deeprank2 itself: `pip install deeprank2`. @@ -94,9 +95,9 @@ pip install -e .'[test]' The `test` extra is optional, and can be used to install test-related dependencies, useful during development. -### Testing DeepRank2 installation +#### Testing DeepRank2 installation -You can check that all components were installed correctly, using pytest. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). +You can check that all components were installed correctly, using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). The quick test should be sufficient to ensure that the software works, while the full test (a few minutes) will cover a much broader range of settings to ensure everything is correct. 
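To make the installation check above concrete, here is a minimal sketch of the verification flow (assumptions: commands are run from the root of the cloned repository with the `deeprank2` conda environment active, and the single-module run is only an illustrative spot-check, not the documented quick test):

```bash
# Install the test-related dependencies into the active environment (command from the docs above)
pip install -e .'[test]'

# Illustrative spot-check of a single test module (hypothetical choice, not the official quick test)
pytest tests/utils/test_graph.py

# Full test suite (takes a few minutes)
pytest
```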
diff --git a/env/deeprank2-docker.yml b/env/deeprank2-docker.yml new file mode 100644 index 000000000..440daf9c3 --- /dev/null +++ b/env/deeprank2-docker.yml @@ -0,0 +1,45 @@ +name: deeprank2 +channels: + - pytorch + - pyg + - bioconda + - defaults + - conda-forge + - sbl +dependencies: + - python==3.10 + - pip>=23.3 + - notebook>=7.0.6 + - sbl::libcifpp>=5.1.0 + - sbl::dssp>=4.2.2.1 + - msms>=2.6.1 + - markov_clustering>=0.0.6 + - pytorch=2.1.1 + - torchvision>=0.16.1 + - torchaudio>=2.1.1 + - cpuonly>=2.0 + - pyg>=2.4.0 + - pytorch-scatter>=2.1.2 + - pytorch-sparse>=0.6.18 + - pytorch-cluster>=1.6.3 + - pytorch-spline-conv>=1.2.2 + - tables>=3.8.0 + - numpy>=1.21.5 + - scipy>=1.11.2 + - h5py>=3.6.0 + - networkx>=2.6.3 + - matplotlib>=3.5.1 + - scikit-learn>=1.0.2 + - chart-studio>=1.1.0 + - biopython>=1.81 + - pdb2sql>=0.5.1 + - python-louvain>=0.16 + - tqdm>=4.63.0 + - freesasa>=2.1.0 + - tensorboard>=0.9.0 + - protobuf>=3.20.1 + - ruff>=0.3.0 + - dill>=0.3.8 + - pyarrow>=15.0.0 + - pip: + - --requirement requirements-docker.txt diff --git a/env/deeprank2.yml b/env/deeprank2.yml new file mode 100644 index 000000000..6127fcb66 --- /dev/null +++ b/env/deeprank2.yml @@ -0,0 +1,43 @@ +name: deeprank2 +channels: + - pytorch + - pyg + - bioconda + - defaults + - conda-forge + - sbl +dependencies: + - python==3.10 + - pip>=23.3 + - notebook>=7.0.6 + - sbl::libcifpp>=5.1.0 + - sbl::dssp>=4.2.2.1 + - msms>=2.6.1 + - markov_clustering>=0.0.6 + - pytorch=2.1.1 + - torchvision>=0.16.1 + - torchaudio>=2.1.1 + - cpuonly>=2.0 + - pyg>=2.4.0 + - pytorch-scatter>=2.1.2 + - pytorch-sparse>=0.6.18 + - pytorch-cluster>=1.6.3 + - pytorch-spline-conv>=1.2.2 + - tables>=3.8.0 + - numpy>=1.21.5 + - scipy>=1.11.2 + - h5py>=3.6.0 + - networkx>=2.6.3 + - matplotlib>=3.5.1 + - scikit-learn>=1.0.2 + - chart-studio>=1.1.0 + - biopython>=1.81 + - pdb2sql>=0.5.1 + - python-louvain>=0.16 + - tqdm>=4.63.0 + - freesasa>=2.1.0 + - tensorboard>=0.9.0 + - protobuf>=3.20.1 + - ruff>=0.3.0 + - dill>=0.3.8 + - pyarrow>=15.0.0 diff --git a/env/environment.yml b/env/environment.yml deleted file mode 100644 index 037f735f9..000000000 --- a/env/environment.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: deeprank2 -channels: - - pytorch - - pyg - - bioconda - - defaults - - conda-forge - - sbl -dependencies: - - pip==23.3.* - - python==3.10.* - - msms==2.6.1 - - dssp>=4.2.2.1 - - pytorch==2.1.1 - - pytorch-mutex==1.0.* - - torchvision==0.16.1 - - torchaudio==2.1.1 - - cpuonly==2.0.* - - pyg==2.4.0 - - notebook==7.0.6 - - pip: - - --requirement requirements.txt diff --git a/env/requirements-docker.txt b/env/requirements-docker.txt new file mode 100644 index 000000000..43d6682d6 --- /dev/null +++ b/env/requirements-docker.txt @@ -0,0 +1 @@ +deeprank2==3.0.2 \ No newline at end of file diff --git a/env/requirements.txt b/env/requirements.txt deleted file mode 100644 index 50095d959..000000000 --- a/env/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ ---find-links https://data.pyg.org/whl/torch-2.1.0+cpu.html -torch_scatter==2.1.2 -torch_sparse==0.6.18 -torch_cluster==1.6.3 -torch_spline_conv==1.2.2 -deeprank2==3.0.1 diff --git a/pyproject.toml b/pyproject.toml index e589828a4..5ecc23d9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deeprank2" -version = "3.0.1" +version = "3.0.2" description = "DeepRank2 is an open-source deep learning framework for data mining of protein-protein interfaces or single-residue missense variants." 
readme = "README.md" requires-python = ">=3.10" @@ -34,26 +34,6 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", ] -dependencies = [ - "tables >= 3.8.0", - "numpy >= 1.21.5", - "scipy >= 1.11.2", - "h5py >= 3.6.0", - "networkx >= 2.6.3", - "matplotlib >= 3.5.1", - "pdb2sql >= 0.5.1", - "scikit-learn >= 1.0.2", - "chart-studio >= 1.1.0", - "biopython >= 1.81", - "python-louvain >= 0.16", - "markov-clustering >= 0.0.6.dev0", - "tqdm >= 4.63.0", - "freesasa >= 2.1.0", - "tensorboard >= 0.9.0", - "protobuf >= 3.20.1", - "ruff >= 0.1.13", - "dill", -] [project.optional-dependencies] # development dependency groups @@ -67,6 +47,7 @@ test = [ "coveralls", ] publishing = ["build", "twine", "wheel"] +notebooks = ["nbmake"] [project.urls] Documentation = "https://deeprank2.readthedocs.io/en/latest/?badge=latest" @@ -79,13 +60,15 @@ source = ["deeprank2"] [tool.setuptools.packages.find] include = ["deeprank2*"] -exclude = ["tests*", "*tests.*", "*tests"] +exclude = ["tests", "tests*", "*tests.*", "*tests"] [tool.setuptools.package-data] "*" = ["*.xlsx", "*.param", "*.top", "*residue-classes"] [tool.ruff] line-length = 159 + +[tool.ruff.lint] select = ["ALL"] ignore = [ # Unrealistic for this code base @@ -125,22 +108,9 @@ ignore = [ "D413", # Missing blank line after last section ] -# Allow autofix for all enabled rules. +# Autofix settings fixable = ["ALL"] unfixable = ["F401"] # unused imports (should not disappear while editing) - -[tool.ruff.lint.per-file-ignores] -"tests/*" = [ - "S101", # Use of `assert` detected - "PLR2004", # Magic value used in comparison - "D101", # Missing class docstring - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function -] -"docs/*" = ["ALL"] -"tests/perf/*" = ["T201"] # Use of print statements - -[tool.ruff.lint] extend-safe-fixes = [ "D415", # First line should end with a period, question mark, or exclamation point "D300", # Use triple double quotes `"""` @@ -152,5 +122,16 @@ extend-safe-fixes = [ "B006", # Mutable default argument ] -[tool.ruff.isort] -known-first-party = ["deeprank2"] +isort.known-first-party = ["deeprank2"] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = [ + "S101", # Use of `assert` detected + "PLR2004", # Magic value used in comparison + "D101", # Missing class docstring + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "SLF001", # private member access +] +"docs/*" = ["ALL"] +"tests/perf/*" = ["T201"] # Use of print statements diff --git a/tests/domain/test_forcefield.py b/tests/domain/test_forcefield.py index 4db2c6db7..ee067aaf2 100644 --- a/tests/domain/test_forcefield.py +++ b/tests/domain/test_forcefield.py @@ -10,7 +10,7 @@ def test_atomic_forcefield() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() # The arginine C-zeta should get a positive charge arg = next(r for r in structure.get_chain("A").residues if r.amino_acid == arginine) diff --git a/tests/features/__init__.py b/tests/features/__init__.py index 541e5c777..921444cac 100644 --- a/tests/features/__init__.py +++ b/tests/features/__init__.py @@ -55,7 +55,7 @@ def build_testgraph( # noqa: C901 try: structure: PDBStructure = get_structure(pdb, Path(pdb_path).stem) finally: - pdb._close() # noqa: SLF001 + pdb._close() if not central_res: nodes = set() diff --git a/tests/features/test_contact.py b/tests/features/test_contact.py index d9e9f001a..94766321a 100644 --- 
a/tests/features/test_contact.py +++ b/tests/features/test_contact.py @@ -43,7 +43,7 @@ def _get_contact( try: structure = get_structure(pdb, pdb_id) finally: - pdb._close() # noqa: SLF001 + pdb._close() if not chains: chains = [structure.chains[0], structure.chains[0]] diff --git a/tests/molstruct/test_structure.py b/tests/molstruct/test_structure.py index 6074fc265..03a906a9e 100644 --- a/tests/molstruct/test_structure.py +++ b/tests/molstruct/test_structure.py @@ -12,7 +12,7 @@ def _get_structure(path: str) -> PDBStructure: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() assert structure is not None diff --git a/tests/perf/ppi_perf.py b/tests/perf/ppi_perf.py index f1e48e356..8b5d687ad 100644 --- a/tests/perf/ppi_perf.py +++ b/tests/perf/ppi_perf.py @@ -41,7 +41,7 @@ os.makedirs(os.path.join(processed_data_path, "atomic")) -def get_pdb_files_and_target_data(data_path: str) -> (list[str], list): +def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list]: csv_data = pd.read_csv(os.path.join(data_path, "BA_values.csv")) pdb_files = glob.glob(os.path.join(data_path, "pdb", "*.pdb")) pdb_files.sort() diff --git a/tests/perf/srv_perf.py b/tests/perf/srv_perf.py index 9de2d8268..94be374ee 100644 --- a/tests/perf/srv_perf.py +++ b/tests/perf/srv_perf.py @@ -87,7 +87,7 @@ os.makedirs(os.path.join(processed_data_path, "atomic")) -def get_pdb_files_and_target_data(data_path: str) -> (list[str], list, list, list, list): +def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list, list, list, list]: csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values.csv")) # before running this script change .ent to .pdb pdb_files = glob.glob(os.path.join(data_path, "pdb", "*.pdb")) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 49f1b4b3b..932e7d3c9 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -31,7 +31,7 @@ def _compute_features_manually( hdf5_path: str, features_transform: dict, feat: str, -) -> (NDArray, float, float): +) -> tuple[NDArray, float, float]: """Return specified feature. This function returns the feature specified read from the hdf5 file, after applying manually features_transform dict. diff --git a/tests/test_querycollection.py b/tests/test_querycollection.py index 904796634..7b4694657 100644 --- a/tests/test_querycollection.py +++ b/tests/test_querycollection.py @@ -21,7 +21,7 @@ def _querycollection_tester( feature_modules: ModuleType | list[ModuleType] | None = None, cpu_count: int = 1, combine_output: bool = True, -) -> (QueryCollection, str, list[str]): +) -> tuple[QueryCollection, str, list[str]]: """ Generic function to test QueryCollection class. 
@@ -279,6 +279,6 @@ def test_querycollection_duplicates_add() -> None: "1ATN_2w_2", "1ATN_3w", ] - assert queries._ids_count["residue-ppi:A-B:1ATN_1w"] == 3 # noqa: SLF001 - assert queries._ids_count["residue-ppi:A-B:1ATN_2w"] == 2 # noqa: SLF001 - assert queries._ids_count["residue-ppi:A-B:1ATN_3w"] == 1 # noqa: SLF001 + assert queries._ids_count["residue-ppi:A-B:1ATN_1w"] == 3 + assert queries._ids_count["residue-ppi:A-B:1ATN_2w"] == 2 + assert queries._ids_count["residue-ppi:A-B:1ATN_3w"] == 1 diff --git a/tests/utils/test_buildgraph.py b/tests/utils/test_buildgraph.py index 35559c6c2..a57e7ec74 100644 --- a/tests/utils/test_buildgraph.py +++ b/tests/utils/test_buildgraph.py @@ -12,7 +12,7 @@ def test_get_structure_complete() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() assert structure is not None @@ -40,7 +40,7 @@ def test_get_structure_from_nmr_with_dna() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() assert structure is not None assert structure.chains[0].residues[0].amino_acid is None # DNA @@ -52,7 +52,7 @@ def test_residue_contact_pairs() -> None: try: structure = get_structure(pdb, "1ATN") finally: - pdb._close() # noqa: SLF001 + pdb._close() residue_pairs = get_residue_contact_pairs(pdb_path, structure, "A", "B", 8.5) assert len(residue_pairs) > 0 @@ -64,7 +64,7 @@ def test_surrounding_residues() -> None: try: structure = get_structure(pdb, "101M") finally: - pdb._close() # noqa: SLF001 + pdb._close() all_residues = structure.get_chain("A").residues # A nicely centered residue diff --git a/tests/utils/test_graph.py b/tests/utils/test_graph.py index b792ff29b..39bd2c9c8 100644 --- a/tests/utils/test_graph.py +++ b/tests/utils/test_graph.py @@ -35,7 +35,7 @@ def graph() -> Graph: try: structure = get_structure(pdb, entry_id) finally: - pdb._close() # noqa: SLF001 + pdb._close() # build a contact from two residues residue0 = structure.chains[0].residues[0] diff --git a/tests/utils/test_pssmdata.py b/tests/utils/test_pssmdata.py index 45d7cf0a6..262a517d4 100644 --- a/tests/utils/test_pssmdata.py +++ b/tests/utils/test_pssmdata.py @@ -10,7 +10,7 @@ def test_add_pssm() -> None: try: structure = get_structure(pdb, "1ATN") finally: - pdb._close() # noqa: SLF001 + pdb._close() for chain in structure.chains: with open(f"tests/data/pssm/1ATN/1ATN.{chain.id}.pdb.pssm", encoding="utf-8") as f: diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 8330e9a8b..2d1d9650a 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -255,9 +255,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.'\n", - ")" + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" ] }, { @@ -341,7 +339,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] @@ -360,9 +358,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(\n", - " processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"])\n", - ")\n", + "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -480,9 +476,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.'\n", - ")" + "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" ] }, { @@ -500,7 +494,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 11a776051..1a68f31ad 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -266,9 +266,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.'\n", - ")" + "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" ] }, { @@ -359,7 +357,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] @@ -378,9 +376,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(\n", - " processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"])\n", - ")\n", + "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -500,9 +496,7 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(\n", - " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.'\n", - ")" + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" ] }, { @@ -520,7 +514,7 @@ "outputs": [], "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data)\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", "df = dataset.hdf5_to_pandas()\n", "df.head()" ] @@ -548,6 +542,11 @@ "source": [ "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. `atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 784bc03c7..ae8870c47 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -1,772 +1,776 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training Neural Networks\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "\n", - "\n", - "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", - "\n", - "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", - "\n", - "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Input data\n", - "\n", - "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", - "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", - "\n", - "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilities\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The libraries needed for this tutorial:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import glob\n", - "import os\n", - "import h5py\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score\n", - "import plotly.express as px\n", - "import torch\n", - "import numpy as np\n", - "\n", - "np.seterr(divide=\"ignore\")\n", - "np.seterr(invalid=\"ignore\")\n", - "import pandas as pd\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "from deeprank2.dataset import GraphDataset, GridDataset\n", - "from deeprank2.trainer import Trainer\n", - "from deeprank2.neuralnets.gnn.naive_gnn import NaiveNetwork\n", - "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", - "from deeprank2.utils.exporters import HDF5OutputExporter\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Paths and sets\n", - "\n", - "The paths for reading the processed data:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_type = \"ppi\"\n", - "level = \"residue\"\n", - "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", - "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", - "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `data_type` can be either \"ppi\" or \"srv\", depending on which application the user is most interested in. 
The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", - "\n", - "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dict = {}\n", - "df_dict[\"entry\"] = []\n", - "df_dict[\"target\"] = []\n", - "for fname in input_data_path:\n", - " with h5py.File(fname, \"r\") as hdf5:\n", - " for mol in hdf5.keys():\n", - " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", - " df_dict[\"entry\"].append(mol)\n", - " df_dict[\"target\"].append(target_value)\n", - "\n", - "df = pd.DataFrame(data=df_dict)\n", - "df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", - "\n", - "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", - "\n", - "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. 
Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", - "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", - "\n", - "print(f\"Data statistics:\\n\")\n", - "print(f\"Total samples: {len(df)}\\n\")\n", - "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", - "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", - "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", - "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", - "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", - "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Classification example\n", - "\n", - "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### GNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GraphDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", - "\n", - "A few notes about `GraphDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", - "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", - "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", - " - If the `standardize` key is `True`, standardization is applied after transformation. 
Standardization consists in applying the following formula on each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, being ${\\mu}$ the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around mean with a unit standard deviation.\n", - " - The transformation to apply can be speficied as a lambda function as a value of the key `transform`, which defaults to `None`.\n", - " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered.\n", - " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", - " - In the example below a logarithmic transformation and then the standardization are applied to all the features. It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be apply to few features only.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "node_features = [\"res_type\"]\n", - "edge_features = [\"distance\"]\n", - "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " node_features=node_features,\n", - " edge_features=edge_features,\n", - " features_transform=features_transform,\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A few notes about `Trainer` parameters:\n", - "\n", - "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `NaiveNetwork` (implemented in `deeprank2.neuralnets.gnn.naive_gnn`). 
All GNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", - "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential inbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", - "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n", - "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " neuralnet=NaiveNetwork,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default optimizer is `torch.optim.Adam`. It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using `Trainer.set_lossfunction` method, although not all are currently implemented.\n", - "\n", - "Then the model can be trained using the `train()` method of the `Trainer` class.\n", - "\n", - "A few notes about `train()` method parameters:\n", - "\n", - "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicated the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", - "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", - "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", - "\n", - "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. 
This HDF5 file can be read as a Pandas Dataframe:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(\n", - " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", - ")\n", - "output_test = pd.read_hdf(\n", - " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", - ")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", - "\n", - "For example, the loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - GNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", - " df_plot_phase = df_plot[(df_plot.phase == set)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {set}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. 
For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", - "\n", - "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GridDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training CNNs the user can create `GridDataset` instances.\n", - "\n", - "A few notes about `GridDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", - "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). 
All CNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", - "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer = Trainer(\n", - " neuralnet=CnnClassification,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", - ")\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(\n", - " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", - ")\n", - "output_test = pd.read_hdf(\n", - " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", - ")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " 
yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - CNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And some metrics of interest for classification tasks:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", - " df_plot_phase = df_plot[(df_plot.phase == set)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {set}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "deeprank2", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training Neural Networks\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "\n", + "\n", + "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", + "\n", + "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). 
For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", + "\n", + "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input data\n", + "\n", + "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", + "\n", + "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "\n", + "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Utilities\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The libraries needed for this tutorial:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import glob\n", + "import os\n", + "import h5py\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score\n", + "import plotly.express as px\n", + "import torch\n", + "import numpy as np\n", + "\n", + "np.seterr(divide=\"ignore\")\n", + "np.seterr(invalid=\"ignore\")\n", + "import pandas as pd\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "from deeprank2.dataset import GraphDataset, GridDataset\n", + "from deeprank2.trainer import Trainer\n", + "from deeprank2.neuralnets.gnn.naive_gnn import NaiveNetwork\n", + "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", + "from deeprank2.utils.exporters import HDF5OutputExporter\n", + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Paths and sets\n", + "\n", + "The paths for reading the processed data:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_type = \"ppi\"\n", + "level = \"residue\"\n", + "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", + "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", + "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `data_type` can be either \"ppi\" or \"srv\", depending 
on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", + "\n", + "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV and/or atomic-level data with no changes, apart from setting the `data_type` and `level` parameters in the cell above.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_dict = {}\n", + "df_dict[\"entry\"] = []\n", + "df_dict[\"target\"] = []\n", + "for fname in input_data_path:\n", + " with h5py.File(fname, \"r\") as hdf5:\n", + " for mol in hdf5.keys():\n", + " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", + " df_dict[\"entry\"].append(mol)\n", + " df_dict[\"target\"].append(target_value)\n", + "\n", + "df = pd.DataFrame(data=df_dict)\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second is its binary mapping, where 0 (BA > 500 nM) indicates a non-binding complex and 1 (BA <= 500 nM) a binding one.\n", + "\n", + "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", + "\n", + "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. 
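A brief aside on the \"binary\" target described above: it is simply a 500 nM threshold applied to the BA value. The snippet below is a hypothetical illustration only; in practice these labels are already computed and stored in the HDF5 files at data-generation time.

```python
import pandas as pd

# Made-up binding affinities in nM, purely for illustration.
ba = pd.Series([30.0, 500.0, 12000.0], name="BA")

# 1 = binding (BA <= 500 nM), 0 = non-binding (BA > 500 nM).
binary = (ba <= 500).astype(int)
print(binary.tolist())  # [1, 1, 0]
```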
Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", + "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", + "\n", + "print(f\"Data statistics:\\n\")\n", + "print(f\"Total samples: {len(df)}\\n\")\n", + "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", + "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", + "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", + "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", + "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", + "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification example\n", + "\n", + "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GNN\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GraphDataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", + "\n", + "A few notes about `GraphDataset` parameters:\n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", + "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", + "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", + " - If the `standardize` key is `True`, standardization is applied after transformation. 
Standardization consists of applying the following formula to each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, where ${\\mu}$ is the mean and ${\\sigma}$ is the standard deviation. Standardization is a scaling method where the values are centered around the mean with a unit standard deviation.\n", + " - The transformation to apply can be specified as a lambda function as a value of the key `transform`, which defaults to `None`.\n", + " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling the validation and test sets. For doing so, the `train_source` parameter is used: when it is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since for the other datasets it is ignored and only that of `train_source` is considered.\n", + " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", + " - In the example below, a cube-root transformation followed by standardization is applied to all the features. It is also possible to use specific feature names as keys to indicate that the transformation and/or standardization need to be applied to a few features only.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target = \"binary\"\n", + "task = \"classif\"\n", + "node_features = [\"res_type\"]\n", + "edge_features = [\"distance\"]\n", + "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", + "\n", + "print(\"Loading training data...\")\n", + "dataset_train = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " node_features=node_features,\n", + " edge_features=edge_features,\n", + " features_transform=features_transform,\n", + " target=target,\n", + " task=task,\n", + ")\n", + "print(\"\\nLoading validation data...\")\n", + "dataset_val = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", + ")\n", + "print(\"\\nLoading test data...\")\n", + "dataset_test = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trainer\n", + "\n", + "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A few notes about `Trainer` parameters:\n", + "\n", + "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `NaiveNetwork` (implemented in `deeprank2.neuralnets.gnn.naive_gnn`). 
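To make the `neuralnet` requirement more concrete, a custom network is simply a `torch.nn.Module` that operates on PyTorch Geometric batches. The sketch below is schematic only: the class name is invented, and the constructor arguments and output width that DeepRank2 actually passes to `neuralnet` are placeholders here, not the required interface.

```python
from torch import nn
from torch_geometric.nn import GCNConv, global_mean_pool


class TinyGNN(nn.Module):
    """Schematic graph network; the argument names below are illustrative placeholders."""

    def __init__(self, input_shape: int, output_shape: int = 2):
        super().__init__()
        self.conv1 = GCNConv(input_shape, 16)
        self.conv2 = GCNConv(16, 16)
        self.head = nn.Linear(16, output_shape)

    def forward(self, data):
        # `data` is a PyTorch Geometric batch: node features, edge indices, batch vector.
        x = self.conv1(data.x, data.edge_index).relu()
        x = self.conv2(x, data.edge_index).relu()
        x = global_mean_pool(x, data.batch)  # one embedding per graph
        return self.head(x)
```

A class of this shape could, in principle, be passed as `neuralnet` in place of `NaiveNetwork`.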
All GNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", + "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential imbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", + "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n", + "- The user can specify a deeprank2 exporter or a custom one in the `output_exporters` parameter, together with the path where the results should be saved. Exporters are used for storing prediction information collected later on during training and testing. Later, the results saved by `HDF5OutputExporter` will be read and evaluated.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " neuralnet=NaiveNetwork,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default optimizer is `torch.optim.Adam`. It is possible to specify the optimizer's parameters or to use another PyTorch optimizer object:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.SGD\n", + "lr = 1e-3\n", + "weight_decay = 0.001\n", + "\n", + "trainer.configure_optimizers(optimizer, lr, weight_decay)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using the `Trainer.set_lossfunction` method, although not all are currently implemented.\n", + "\n", + "Then the model can be trained using the `train()` method of the `Trainer` class.\n", + "\n", + "A few notes about `train()` method parameters:\n", + "\n", + "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling the early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicates the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", + "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` a few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", + "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epochs = 20\n", + "batch_size = 8\n", + "earlystop_patience = 5\n", + "earlystop_maxgap = 0.1\n", + "min_epoch = 10\n", + "\n", + "trainer.train(\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", + ")\n", + "\n", + "epoch = trainer.epoch_saved_model\n", + "print(f\"Model saved at epoch {epoch}\")\n", + "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", + "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Testing\n", + "\n", + "And the trained model can be tested on `dataset_test`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Results visualization\n", + "\n", + "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", + "\n", + "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. 
This HDF5 file can be read as a Pandas Dataframe:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_train = pd.read_hdf(\n", + " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", + ")\n", + "output_test = pd.read_hdf(\n", + " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", + ")\n", + "output_train.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", + "\n", + "For example, the loss across the epochs can be plotted for the training and the validation sets:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", + "\n", + "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", + "\n", + "fig.update_layout(\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - GNN training\",\n", + " width=700,\n", + " height=400,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "skip-execution" + ] + }, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "df = pd.concat([output_train, output_test])\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", + "\n", + "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", + " df_plot_phase = df_plot[(df_plot.phase == set)]\n", + " y_true = df_plot_phase.target\n", + " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", + "\n", + " print(f\"\\nMetrics for {set}:\")\n", + " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", + " auc_score = auc(fpr_roc, tpr_roc)\n", + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. 
For a more reliable network, we suggest using a number of data points on the order of at least tens of thousands.\n", + "\n", + "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CNN\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GridDataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For training CNNs, the user can create `GridDataset` instances.\n", + "\n", + "A few notes about `GridDataset` parameters:\n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grid features are derived from node and edge features mapped from graphs to grids, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\". A sketch of passing an explicit `features` list is shown right after the next cell.\n", + "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target = \"binary\"\n", + "task = \"classif\"\n", + "\n", + "print(\"Loading training data...\")\n", + "dataset_train = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " target=target,\n", + " task=task,\n", + ")\n", + "print(\"\\nLoading validation data...\")\n", + "dataset_val = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", + ")\n", + "print(\"\\nLoading test data...\")\n", + "dataset_test = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", + ")" + ] + },
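+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Purely as an illustration (the rest of this tutorial does not depend on it), the cell below sketches how the dataset could be restricted to an explicit list of features via the `features` parameter. The feature names used here are placeholders, not a verified list: check which mapped features are actually stored in your HDF5 files (see `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\") and replace them accordingly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hedged sketch: restrict the grid features used by the dataset.\n", + "# The names below are placeholders; use feature names present in your HDF5 files.\n", + "dataset_train_subset = GridDataset(\n", + "    hdf5_path=input_data_path,\n", + "    subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + "    target=target,\n", + "    task=task,\n", + "    features=[\"res_mass\", \"res_charge\", \"distance\"], # placeholder feature names\n", + ")\n", + "print(f\"Number of data points loaded: {len(dataset_train_subset)}\")" + ] + },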
+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trainer\n", + "\n", + "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). All CNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones. A minimal sketch of a custom architecture is shown right after this list.\n", + "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" + ] + },
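+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Purely as an illustration (it is not used in the rest of this tutorial), the cell below sketches what such a custom class could look like. It assumes, like the bundled models in `deeprank2.neuralnets.cnn.model3d`, a constructor that receives the number of input feature channels and the 3D box shape, and a `forward` method that receives a batch whose `x` attribute holds the grid tensor with shape (batch, channels, x, y, z). These assumptions should be verified against the `CnnClassification` source before plugging a custom class into `Trainer`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hedged sketch of a custom 3D CNN; the interface assumptions are described in the cell above.\n", + "from torch import nn\n", + "\n", + "\n", + "class SimpleCnn3D(nn.Module):\n", + "    def __init__(self, num_features, box_shape, num_classes=2):\n", + "        super().__init__()\n", + "        # box_shape is accepted only to mirror the assumed constructor signature;\n", + "        # adaptive pooling makes the classification head independent of the box size\n", + "        self.convs = nn.Sequential(\n", + "            nn.Conv3d(num_features, 8, kernel_size=3, padding=1),\n", + "            nn.ReLU(),\n", + "            nn.MaxPool3d(2),\n", + "            nn.Conv3d(8, 16, kernel_size=3, padding=1),\n", + "            nn.ReLU(),\n", + "            nn.AdaptiveAvgPool3d(1),\n", + "        )\n", + "        self.head = nn.Linear(16, num_classes)\n", + "\n", + "    def forward(self, data):\n", + "        # assumption: data.x holds the grid tensor with shape (batch, channels, x, y, z)\n", + "        z = self.convs(data.x)\n", + "        return self.head(z.flatten(start_dim=1))" + ] + },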
yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - CNN training\",\n", + " width=700,\n", + " height=400,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And some metrics of interest for classification tasks:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "df = pd.concat([output_train, output_test])\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", + "\n", + "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", + " df_plot_phase = df_plot[(df_plot.phase == set)]\n", + " y_true = df_plot_phase.target\n", + " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", + "\n", + " print(f\"\\nMetrics for {set}:\")\n", + " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", + " auc_score = auc(fpr_roc, tpr_roc)\n", + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deeprank2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 }