From 9e8b8f37fce64423b1dd006f968bfa41e6b8d7e2 Mon Sep 17 00:00:00 2001
From: awa59kst120df
Date: Mon, 7 Oct 2024 13:24:42 +0200
Subject: [PATCH] fixed github action

---
 .github/workflows/run_test.yml                |   8 +-
 .gitignore                                    |   3 +
 _pyproject_uv.toml                            |  69 +++++
 custom_model/README.rst                       | 242 ------------------
 ...erate_molecule_dataset_from_csv__custom.py | 141 ----------
 pyproject.toml                                |   2 -
 6 files changed, 76 insertions(+), 389 deletions(-)
 create mode 100644 _pyproject_uv.toml
 delete mode 100644 custom_model/README.rst
 delete mode 100644 custom_model/generate_molecule_dataset_from_csv__custom.py

diff --git a/.github/workflows/run_test.yml b/.github/workflows/run_test.yml
index d7f8fa0bf..4152c67d0 100644
--- a/.github/workflows/run_test.yml
+++ b/.github/workflows/run_test.yml
@@ -20,15 +20,15 @@ jobs:
 
     - name: Install dependencies
       run: |
-        python -m pip install uv
-        uv pip install --system --no-build-isolation torch==2.1.2
-        uv pip install --system --no-build-isolation -e .
+        pip install torch==2.1.2
+        pip install .
 
     - name: Lint with Ruff
       run: |
-        uv pip install --system ruff
+        pip install ruff
         ruff check .
 
     - name: Run tests
       run: |
+        pip install pytest
         python -m unittest discover -s tests
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index ab17819e3..4a9ab1b06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,6 @@
 graph_attention_student/examples/results/*
 graph_attention_student/examples/assets/*
 tests/artifacts/*
+
+# slurm logs
+slurm-*
\ No newline at end of file
diff --git a/_pyproject_uv.toml b/_pyproject_uv.toml
new file mode 100644
index 000000000..dfb01c292
--- /dev/null
+++ b/_pyproject_uv.toml
@@ -0,0 +1,69 @@
+[build-system]
+requires = ["hatchling>=1.25.0", "setuptools>=70.0.0"]
+#build-backend = "hatchling.build"
+
+[project]
+name = "graph_attention_student"
+version = "0.18.2"
+description = "MEGAN: Multi Explanation Graph Attention Network"
+requires-python = ">=3.10.0,<3.12"
+dependencies = [
+    "pycomex>=0.13.1",
+    "visual_graph_datasets>=0.11.0",
+    "click>=7.1.2,<8.0.0",
+    "rich_click>=1.8.0,<2.0.0",
+    "numpy>=1.22.0,<2.0.0",
+    "matplotlib>=3.5.3,<4.0.0",
+    "imageio>=2.19.0,<3.0.0",
+    "seaborn>=0.13.1,<0.14.0",
+    "cairosvg>=2.5.2,<3.0.0",
+    "rdkit>=2022.9.1",
+    "orjson>=3.8.0",
+    "hdbscan>=0.8.33",
+    "torch>=2.1.2,<=2.3.1",
+    "torch_scatter>=2.1.2",
+    "torch_geometric>=2.1.2",
+    "lightning>=2.1.3",
+    "nltk>=3.7,<4.0.0",
+    "setuptools>=70.0.0",
+]
+
+[project.scripts]
+graph_attention_student = 'graph_attention_student.cli:cli'
+
+[tool.uv]
+no-build-isolation-package = ["torch", "torch_scatter", "torch_geometric"]
+dev-dependencies = [
+    "pytest==8.3.2",
+    "ruff==0.6.9",
+]
+
+[tool.uv.workspace]
+exclude = [
+    "graph_attention_student/experiments/results",
+    "graph_attention_student/examples/results",
+    "venv",
+]
+
+# Here we define the editable dependencies which are mainly used during development
+[tool.uv.sources]
+pycomex = { path = "../pycomex" }
+visual_graph_datasets = { path = "../visual_graph_datasets" }
+
+# We need to provide some additional metadata for torch_geometric here, specifically
+# the fact that it depends on torch, because for some reason the package itself does
+# not declare this dependency.
+[[tool.uv.dependency-metadata]]
+name = "torch_geometric"
+version = "2.4.0"
+requires-dist = [
+    "torch>=2.1.2,<=2.3.1",
+    "torch_scatter>=2.1.2",
+]
+
+[[tool.uv.dependency-metadata]]
+name = "torch-scatter"
+version = "2.1.2"
+requires-dist = [
+    "torch>=2.1.2,<=2.3.1",
+]
diff --git a/custom_model/README.rst b/custom_model/README.rst
deleted file mode 100644
index 9aa0863cc..000000000
--- a/custom_model/README.rst
+++ /dev/null
@@ -1,242 +0,0 @@
-==============================================
-๐Ÿค– Training a MEGAN Model for a Custom Dataset
-==============================================
-
-This file discusses the steps required to apply the MEGAN model to a custom dataset. Generally, the easiest way to do
-this is to use the existing experimentation code in this package and simply tune the parameters of these experiment
-scripts to your specific needs.
-
-Overall, the process can be summarized into the following two steps:
-
-1. Convert the custom dataset from its original format into a *visual graph dataset*. This is a special dataset format
-   from which the model can be easily trained.
-2. Configure the model hyperparameters and train the model on the visual graph dataset created in the previous step.
-
-
-๐Ÿงช PyComex - Computational Experiments
-======================================
-
-Generally, all the experimentation code is based on the PyComex_ micro-framework. PyComex is a simple and lightweight
-framework that simplifies many aspects of developing, executing and managing computational experiments. This section
-introduces some of its core aspects which are necessary for the subsequent sections.
-For more detailed information, please visit the PyComex package: https://github.com/the16thpythonist/pycomex
-
-In PyComex, each individual experiment is self-contained as its own Python module, and running such a module executes
-the corresponding experiment. Each experiment will automatically create a new folder containing all the experiment
-artifacts in a subfolder of the ``results`` folder. An example folder structure may look like this:
-
-.. code-block:: text
-
-    .
-    โ””โ”€โ”€ experiments/
-        โ”œโ”€โ”€ results/
-        โ”‚   โ”œโ”€โ”€ first_experiment/
-        โ”‚   โ”‚   โ””โ”€โ”€ debug/
-        โ”‚   โ”‚       โ”œโ”€โ”€ experiment_data.json
-        โ”‚   โ”‚       โ”œโ”€โ”€ experiment_meta.json
-        โ”‚   โ”‚       โ”œโ”€โ”€ experiment_log.txt
-        โ”‚   โ”‚       โ””โ”€โ”€ plot.png
-        โ”‚   โ””โ”€โ”€ second_experiment
-        โ”œโ”€โ”€ first_experiment.py
-        โ””โ”€โ”€ second_experiment.py
-
-
-Experiments are designed to follow the DRY (don't repeat yourself) principle. When creating a new experiment which is
-simply a small deviation from an already existing experiment, the code doesn't have to be copied; it can be reused
-directly through *experiment inheritance*. A sub-experiment can be defined like this:
-
-.. code-block:: python
-
-    from pycomex.functional.experiment import Experiment
-    from pycomex.utils import folder_path, file_namespace
-
-    # The default parameters of the base experiment can be overwritten simply by assigning the
-    # parameter variables new values at the beginning of the module.
-    SOURCE_PATH: str = 'path/to/new/source/file.txt'
-
-    experiment = Experiment.extend(
-        'base_experiment.py',
-        base_path=folder_path(__file__),
-        namespace=file_namespace(__file__),
-        glob=globals()
-    )
-
-    # Besides parameter values, it is also possible to overwrite certain functionality
-    # in the base experiment by overwriting the corresponding hooks.
-    # In this example, whenever the "load_source" hook function would be invoked in the
-    # base experiment file, the functionality would be replaced by the custom code that
-    # is defined here in the sub-experiment.
-    @experiment.hook('load_source', default=False, replace=True)
-    def load_source(e: Experiment) -> str:
-
-        # The parameter values from the top of the file can be accessed as instance attributes
-        # of the experiment instance, which is passed as a default parameter to each
-        # hook function.
-        with open(e.SOURCE_PATH, 'r') as file:
-            return file.read()
-
-    # This special method has to be called at the top-most level of the module
-    # so that the experiment code actually gets executed when executing the script.
-    experiment.run_if_main()
-
-
-Specifically, this experiment inheritance can be used to extend the already existing functionality for the
-model training, overwriting only those parameters and pieces of functionality that are required for each
-specific use case.
-
-
-๐Ÿ“ Converting SMILES-based Datasets
-===================================
-
-Since applying MEGAN to molecular property prediction on a SMILES-based dataset is the most common use case, the
-following sections use it as an example to demonstrate the steps required to train a custom MEGAN model.
-
-Most often, a dataset for molecular property prediction will be given as a CSV file which contains the source
-molecules as SMILES strings in one column, and the corresponding target values in another column.
-The following example illustrates what such a source CSV file may look like:
-
-.. code-block:: csv
-
-    smiles,logP
-    CCO,0.2
-    CCN,0.3
-    CCC,0.5
-    CC(=O)O,0.8
-    CC(=O)N,0.7
-    C1CC1,0.6
-    ...
-
-Whenever the dataset is given in this CSV format, the pre-defined ``generate_molecule_dataset_from_csv.py`` experiment
-can be used to conveniently convert it into a visual graph dataset. In essence, one has to create a new
-sub-experiment module that inherits from this base experiment and modify the corresponding experiment parameters that
-provide the necessary information about the source dataset. This sub-experiment can then be executed to generate the
-visual graph dataset format.
-
-.. code-block:: python
-
-    import os
-    import typing as t
-
-    from pycomex.functional.experiment import Experiment
-    from pycomex.utils import folder_path, file_namespace
-    from visual_graph_datasets.util import EXPERIMENTS_PATH
-
-    # == CUSTOMIZE PARAMETERS ==
-
-    # Insert absolute path to your own CSV file
-    CSV_FILE_PATH: str = 'path/to/file.csv'
-    # Insert name of the column that contains the SMILES representation
-    SMILES_COLUMN_NAME: str = 'smiles'
-    # Insert names of the columns that contain the target values
-    TARGET_COLUMN_NAMES: t.List[str] = ['class_0', 'class_1']
-    # Define the type of the dataset / task
-    TARGET_TYPE: str = 'classification'  # or 'regression'
-    # The name of the dataset in the artifacts folder
-    DATASET_NAME: str = 'dataset'
-
-    # == INHERIT EXPERIMENT ==
-
-    experiment = Experiment.extend(
-        os.path.join(EXPERIMENTS_PATH, 'generate_molecule_dataset_from_csv.py'),
-        base_path=folder_path(__file__),
-        namespace=file_namespace(__file__),
-        glob=globals()
-    )
-    experiment.run_if_main()
-
-
-**NOTE.** For a classification dataset, there should be as many target columns as there are classes in the dataset.
-The corresponding values in these columns should be 0/1 values indicating whether or not a molecule belongs to that
-class. For regression problems, the single target column should contain the raw float property values.
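-
-As a purely illustrative example of this layout, a two-class version of the CSV file from above might look like
-this (the column names ``class_0`` and ``class_1`` are arbitrary placeholders):
-
-.. code-block:: csv
-
-    smiles,class_0,class_1
-    CCO,1,0
-    CCN,0,1
-    CC(=O)O,1,0
-    ...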
-
-After executing this sub-experiment, a new visual graph dataset will be created in the artifacts subfolder of the
-``results`` folder. The absolute path to this dataset folder will be required in the training step.
-
-
-๐Ÿค– Training the MEGAN Model
-===========================
-
-Assuming that a new visual graph dataset was successfully generated in the previous step, this section elaborates on
-how to train a new MEGAN model based on this dataset.
-
-To train, one has to create a new sub-experiment module that inherits from the ``vgd_torch__megan.py`` base experiment
-like this:
-
-.. code-block:: python
-
-    import os
-    import typing as t
-
-    from pycomex.functional.experiment import Experiment
-    from pycomex.utils import folder_path, file_namespace
-    from graph_attention_student.util import EXPERIMENTS_PATH
-
-    # == CUSTOMIZE PARAMETERS ==
-
-    # Insert absolute path to the recently created visual graph dataset folder
-    VISUAL_GRAPH_DATASET: str = 'path/to/visual/graph/dataset/folder'
-    # Define the type of the dataset / task
-    DATASET_TYPE: str = 'classification'  # or 'regression'
-    # The number of randomly chosen elements from the dataset to act as the test set
-    NUM_TEST: int = 1000
-
-    # The number of hidden units in the message passing layers of the network
-    UNITS: t.List[int] = [64, 64, 64]
-    # The numbers of hidden units in the dense projection networks
-    PROJECTION_UNITS: t.List[int] = [64, 128, 258]
-    # The number of hidden units in the final prediction network
-    # NOTE: The last value must be equal to the number of target values in the dataset!
-    FINAL_UNITS: t.List[int] = [32, 1]
-    # The number of explanation channels.
-    # regression: always 2 (positive and negative) - classification: number of classes
-    NUM_CHANNELS: int = 2
-    # For regression tasks, this value should be set to the median target value of the
-    # dataset (defines the reference point of what is considered "negative" and "positive")
-    REGRESSION_REFERENCE: t.Optional[float] = None
-    # Number of graphs to use for one batch
-    BATCH_SIZE: int = 64
-    # Number of full epochs to train the model for
-    EPOCHS: int = 100
-
-    # == INHERIT EXPERIMENT ==
-
-    experiment = Experiment.extend(
-        os.path.join(EXPERIMENTS_PATH, 'vgd_torch__megan.py'),
-        base_path=folder_path(__file__),
-        namespace=file_namespace(__file__),
-        glob=globals()
-    )
-    experiment.run_if_main()
-
-
-Executing this experiment will initiate the training of a new MEGAN model. Depending on the dataset size and the
-chosen number of training epochs, this may take a while. Once finished, the results of the training and the
-evaluation on the test set are saved as artifacts in the corresponding subfolder of the ``results`` folder.
-
-The artifacts also include a ``model.ckpt`` file which can be loaded to perform new predictions without having to
-retrain the model.
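-
-As a minimal sketch of how such a checkpoint could be used for inference afterwards (this assumes a recent version
-of the package in which ``Megan`` is a PyTorch Lightning module; the import path and the ``forward_graphs`` helper
-are assumptions and may differ between versions):
-
-.. code-block:: python
-
-    from graph_attention_student.torch.megan import Megan  # assumed import path
-
-    # Lightning modules can be restored directly from a checkpoint file.
-    model = Megan.load_from_checkpoint('path/to/results/model.ckpt')
-    model.eval()
-
-    # "graphs" is a placeholder for a list of graph dicts loaded from the
-    # visual graph dataset; forward_graphs is assumed to batch-predict them.
-    predictions = model.forward_graphs(graphs)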
-
-
-โ“ FAQs
-=======
-
-This section answers some common questions that may arise during the process of training a custom MEGAN model.
-
-What if I need to customize additional aspects not listed here?
----------------------------------------------------------------
-
-In this case, a good first step is to read through the base experiment files that are used as the basis of
-the specific sub-experiments:
-
-- https://github.com/aimat-lab/graph_attention_student/blob/master/graph_attention_student/experiments/vgd_torch__megan.py
-- https://github.com/aimat-lab/graph_attention_student/blob/master/graph_attention_student/experiments/vgd_torch.py
-- https://github.com/aimat-lab/visual_graph_datasets/blob/master/visual_graph_datasets/experiments/generate_molecule_dataset_from_csv.py
-
-These files define many more parameters than the ones presented in this guide. Chances are that you'll already
-find a parameter for the customization you have in mind. If not, the next best option is to look at what kinds of
-*hooks* are used in these base experiments. As a first measure, it'd make sense to overwrite one of the hooks to
-achieve the desired functionality, as sketched below.
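-
-Such an override in a sub-experiment follows the same pattern as the ``load_source`` example from the PyComex
-section above (the hook name ``after_training`` used here is hypothetical; consult the base experiment files for
-the hooks that actually exist):
-
-.. code-block:: python
-
-    # Replaces the hypothetical "after_training" hook of the base experiment
-    # with custom code, e.g. to log or store additional evaluation artifacts.
-    @experiment.hook('after_training', default=False, replace=True)
-    def after_training(e: Experiment, model) -> None:
-        e.log('training finished - running custom evaluation...')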
-
-Only if none of these options are possible should you modify the base experiment files directly to implement the
-desired functionality. This measure is discouraged, however, because such custom modifications will conflict with
-future updates to the base experiment files. If the changes are implemented purely by overwriting parameters or
-hooks in a sub-experiment, the chances are high that these will remain compatible even if the base experiments are
-updated in future versions of the ``graph_attention_student`` or ``visual_graph_datasets`` packages.
-
-.. _Pycomex: https://github.com/the16thpythonist/pycomex/tree/master
\ No newline at end of file
diff --git a/custom_model/generate_molecule_dataset_from_csv__custom.py b/custom_model/generate_molecule_dataset_from_csv__custom.py
deleted file mode 100644
index 2ed285243..000000000
--- a/custom_model/generate_molecule_dataset_from_csv__custom.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import os
-import typing as t
-
-from pycomex.functional.experiment import Experiment
-from pycomex.utils import folder_path, file_namespace
-
-from visual_graph_datasets.util import EXPERIMENTS_PATH
-
-# == SOURCE PARAMETERS ==
-# These parameters determine how to handle the source CSV file of the dataset. There exists the possibility
-# to define a file from the local system or to download a file from the VGD remote file share location.
-# In this section one also has to determine, for example, the type of the source dataset (regression,
-# classification) and provide the names of the relevant columns in the CSV file.
-
-# :param FILE_SHARE_PROVIDER:
-#       The vgd file share provider from which to download the CSV file to be used as the source for the VGD
-#       conversion.
-FILE_SHARE_PROVIDER: str = 'main'
-# :param CSV_FILE_NAME:
-#       The name of the CSV file to be used as the source for the dataset conversion.
-#       This may be one of the following two things:
-#       1. A valid absolute file path on the local system pointing to a CSV file to be used as the source
-#          for the VGD conversion.
-#       2. A valid relative path to a CSV file stashed on the given vgd file share provider, which will be
-#          downloaded first and then processed.
-CSV_FILE_NAME: str = 'source/benzene_solubility.csv'
-# :param INDEX_COLUMN_NAME:
-#       (Optional) This may define the string name of the CSV column which contains the integer index
-#       associated with each dataset element. If this is not given, integer indices will be randomly
-#       generated for each element in the final VGD.
-INDEX_COLUMN_NAME: t.Optional[str] = None
-# :param INDICES_BLACKLIST_PATH:
-#       Optionally, it is possible to define the path to a file which defines the blacklisted indices for the
-#       dataset. This file should contain a list of integers, where each integer represents the index of an
-#       element which should be excluded from the final dataset. The file should be a normal TXT file where
-#       each integer is on a new line.
-#       The indices listed in that file will be immediately skipped during processing, without even loading
-#       the molecule.
-INDICES_BLACKLIST_PATH: t.Optional[str] = None
-# :param SMILES_COLUMN_NAME:
-#       This has to be the string name of the CSV column which contains the SMILES string representation of
-#       the molecule.
-SMILES_COLUMN_NAME: str = 'SMILES'
-# :param TARGET_TYPE:
-#       This has to be the string name of the type of dataset that the source file represents. The valid
-#       options here are "regression" and "classification".
-TARGET_TYPE: str = 'regression'  # or 'classification'
-# :param TARGET_COLUMN_NAMES:
-#       This has to be a list of string column names within the source CSV file, where each name defines
-#       one column that contains a target value for each row. In the regression case, this may be multiple
-#       different regression targets for each element; in the classification case there has to be one
-#       column per class.
-TARGET_COLUMN_NAMES: t.List[str] = ['LogS']
-# :param SPLIT_COLUMN_NAMES:
-#       The keys of this dictionary are integers which represent the indices of various train test splits.
-#       The values are the string names of the columns which define those corresponding splits. It is
-#       expected that these CSV columns contain a "1" if that corresponding element is considered part of
-#       the training set of that split and a "0" if it is part of the test set.
-#       This dictionary may be empty, in which case no information about splits will be added to the
-#       dataset at all.
-SPLIT_COLUMN_NAMES: t.Dict[int, str] = {
-}
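-# As a purely illustrative example (the column name "split" is an assumption about
-# the source CSV file), a single canonical train/test split could be declared as:
-#
-#   SPLIT_COLUMN_NAMES: t.Dict[int, str] = {
-#       0: 'split',
-#   }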
-# :param SUBSET:
-#       Optional. This can be used to set a number of elements after which to terminate the processing
-#       procedure. If this is None, the whole dataset will be processed. This feature can be useful if only
-#       a certain part of the dataset should be processed, for example for testing purposes.
-SUBSET: t.Optional[int] = None
-
-# == PROCESSING PARAMETERS ==
-# These parameters control the processing of the raw SMILES into the molecule representations with RDKit
-# and then finally the conversion into the graph dict representation.
-
-# :param UNDIRECTED_EDGES_AS_TWO:
-#       If this flag is True, the undirected edges which make up the molecular graph will be converted into
-#       two opposing directed edges. Depends on the downstream ML framework to be used.
-UNDIRECTED_EDGES_AS_TWO: bool = True
-# :param USE_NODE_COORDINATES:
-#       If this flag is True, the coordinates of each atom will be calculated for each molecule and the
-#       resulting 3D coordinate vector will be added as a separate property of the resulting graph dict.
-USE_NODE_COORDINATES: bool = True
-# :param GRAPH_METADATA_CALLBACKS:
-#       This is a dictionary that can be used to define additional information that should be extracted from
-#       the csv file and transferred to the metadata dictionary of the visual graph dataset elements.
-#       The keys of this dict should be the string names that the properties will then have in the final
-#       metadata dictionary. The values should be callback functions with two parameters: "mol" is the rdkit
-#       molecule object representation of each dataset element and "data" is the corresponding dictionary
-#       containing all the values from the csv file indexed by the names of the columns. The function itself
-#       should return the actual data to be used for the corresponding custom property.
-GRAPH_METADATA_CALLBACKS = {
-    'name': lambda mol, data: data['SMILES'],
-    'smiles': lambda mol, data: data['SMILES'],
-}
-
-
-# == DATASET PARAMETERS ==
-# These parameters control aspects of the visual graph dataset creation process. This includes, for example,
-# the dimensions of the graph visualization images to be created or the name that should be given to the
-# visual graph dataset folder.
-
-# :param DATASET_CHUNK_SIZE:
-#       This number will determine the chunking of the dataset. Dataset chunking will split the dataset
-#       elements into multiple sub folders within the main VGD folder. Especially for larger datasets
-#       this should increase the efficiency of subsequent IO operations.
-#       If this is None then no chunking will be applied at all and everything will be placed into the
-#       top level folder.
-DATASET_CHUNK_SIZE: t.Optional[int] = 10_000
-# :param DATASET_NAME:
-#       The name given to the visual graph dataset folder which will be created.
-DATASET_NAME: str = 'dataset'
-# :param IMAGE_WIDTH:
-#       The width of the molecule visualization PNG image.
-IMAGE_WIDTH: int = 1000
-# :param IMAGE_HEIGHT:
-#       The height of the molecule visualization PNG image.
-IMAGE_HEIGHT: int = 1000
-
-# == EVALUATION PARAMETERS ==
-# These parameters control the evaluation process, which includes, for example, the plotting of the dataset
-# statistics after the dataset has been completed.
-
-# :param EVAL_LOG_STEP:
-#       The number of iterations after which to print a log message.
-EVAL_LOG_STEP = 100
-# :param NUM_BINS:
-#       The number of bins to use for the histogram.
-NUM_BINS = 10
-# :param PLOT_COLOR:
-#       The color to be used for the plots.
-PLOT_COLOR = 'gray'
-
-
-# == EXPERIMENT PARAMETERS ==
-
-__DEBUG__ = True
-
-experiment = Experiment.extend(
-    os.path.join(EXPERIMENTS_PATH, 'generate_molecule_dataset_from_csv.py'),
-    base_path=folder_path(__file__),
-    namespace=file_namespace(__file__),
-    glob=globals()
-)
-
-experiment.run_if_main()
diff --git a/pyproject.toml b/pyproject.toml
index 3d5126f91..bc7d536d2 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,8 +38,6 @@
 seaborn = ">=0.13.1"
 imageio = ">=2.19.0"
 cairosvg = ">=2.5.2"
 numpy = ">=1.22.0,<2.0.0"
-tensorflow = "<=2.12.0"
-kgcnn = "==2.2.4"
 nltk = ">=3.7"
 rdkit = ">=2022.9.1"
 orjson = ">=3.8.0"