Skip to content

Commit

Permalink
merging main into branch
Browse files Browse the repository at this point in the history
  • Loading branch information
Jgmedina95 committed Mar 18, 2024
2 parents ccc1b95 + ee28a70 commit b8e51c6
Show file tree
Hide file tree
Showing 57 changed files with 10,109 additions and 2,475 deletions.
3 changes: 0 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,5 @@
# OpenAI API Key
OPENAI_API_KEY=YOUR_OPENAI_API_KEY_GOES_HERE # pragma: allowlist secret

# PQA API Key
PQA_API_KEY=YOUR_PQA_API_KEY_GOES_HERE # pragma: allowlist secret

# Serp API key
SERP_API_KEY=YOUR_SERP_API_KEY_GOES_HERE # pragma: allowlist secret
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:

steps:
- uses: actions/checkout@v2
- name: Set up Python "3.9"
- name: Set up Python "3.11"
uses: actions/setup-python@v2
with:
python-version: "3.9"
python-version: "3.11"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ jobs:
environment-file: environment.yaml
python-version: ${{ matrix.python-version }}
auto-activate-base: true
- name: Install openmm pdbfixer mdanalysis with conda
- name: Install pdbfixer with conda
shell: bash -l {0}
run: |
conda install -c conda-forge openmm pdbfixer mdanalysis
conda install -c conda-forge pdbfixer
- name: Install dependencies
shell: bash -l {0}
run: |
Expand All @@ -45,6 +45,5 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
PQA_API_KEY : ${{ secrets.PQA_API_TOKEN }}
run: |
pytest -m "not skip" tests
3 changes: 0 additions & 3 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,5 @@
# Rule for detecting OpenAI API keys
OpenAI API Key: \b[secrets]{3}_[a-zA-Z0-9]{32}\b

# Rule for detecting pqa API keys
PQA API Key: "pqa[a-zA-Z0-9-._]+"

# Rule for detecting serp API keys
# Serp API Key: "[a-zA-Z0-9]{64}"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ To use the OpenMM features in the agent, please set up a conda environment, foll
- Create conda environment: `conda env create -n mdagent -f environment.yaml`
- Activate your environment: `conda activate mdagent`

If you already have a conda environment, you can install the necessary dependencies with the following steps.
- Install the necessary conda dependencies: `conda install -c conda-forge openmm pdbfixer mdanalysis`
If you already have a conda environment, you can install pdbfixer, a necessary dependency, with the following steps.
- Install the necessary conda dependencies: `conda install -c conda-forge pdbfixer`


## Installation
Expand Down
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pre-commit
pytest
pytest-mock
25 changes: 11 additions & 14 deletions mdagent/tools/base_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
from .analysis_tools.plot_tools import SimulationOutputFigures
from .analysis_tools.ppi_tools import PPIDistance
from .analysis_tools.rdf_tool import RDFTool
from .analysis_tools.rmsd_tools import RMSDCalculator
from .analysis_tools.vis_tools import (
CheckDirectoryFiles,
VisFunctions,
VisualizeProtein,
from .analysis_tools.rgy import (
RadiusofGyrationAverage,
RadiusofGyrationPerFrame,
RadiusofGyrationPlot,
)
from .analysis_tools.rmsd_tools import RMSDCalculator
from .analysis_tools.vis_tools import VisFunctions, VisualizeProtein
from .preprocess_tools.clean_tools import (
AddHydrogensCleaningTool,
CleaningToolFunction,
CleaningTools,
RemoveWaterCleaningTool,
SpecializedCleanTool,
)
from .preprocess_tools.pdb_tools import (
PackMolTool,
ProteinName2PDBTool,
SmallMolPDB,
get_pdb,
)
from .preprocess_tools.packing import PackMolTool
from .preprocess_tools.pdb_get import ProteinName2PDBTool, SmallMolPDB, get_pdb
from .simulation_tools.create_simulation import ModifyBaseSimulationScriptTool
from .simulation_tools.setup_and_run import (
InstructionSummary,
SetUpandRunFunction,
SetUpAndRunTool,
SimulationFunctions,
Expand All @@ -33,9 +29,7 @@

__all__ = [
"AddHydrogensCleaningTool",
"CheckDirectoryFiles",
"CleaningTools",
"InstructionSummary",
"ListRegistryPaths",
"MapPath2Name",
"ProteinName2PDBTool",
Expand All @@ -45,6 +39,9 @@
"VisualizeProtein",
"RMSDCalculator",
"RemoveWaterCleaningTool",
"RadiusofGyrationAverage",
"RadiusofGyrationPerFrame",
"RadiusofGyrationPlot",
"Scholar2ResultLLM",
"SerpGitTool",
"SetUpAndRunTool",
Expand Down
7 changes: 5 additions & 2 deletions mdagent/tools/base_tools/analysis_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from .plot_tools import SimulationOutputFigures
from .ppi_tools import PPIDistance
from .rgy import RadiusofGyrationAverage, RadiusofGyrationPerFrame, RadiusofGyrationPlot
from .rmsd_tools import RMSDCalculator
from .vis_tools import CheckDirectoryFiles, VisFunctions, VisualizeProtein
from .vis_tools import VisFunctions, VisualizeProtein

__all__ = [
"PPIDistance",
"RMSDCalculator",
"RadiusofGyrationPerFrame",
"RadiusofGyrationPlot",
"SimulationOutputFigures",
"CheckDirectoryFiles",
"VisualizeProtein",
"VisFunctions",
"RadiusofGyrationAverage",
]
175 changes: 109 additions & 66 deletions mdagent/tools/base_tools/analysis_tools/plot_tools.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,107 @@
import csv
import os
import re
from typing import Optional

import matplotlib.pyplot as plt
from langchain.tools import BaseTool

from mdagent.utils import PathRegistry


def process_csv(file_name):
    """Load a simulation-output CSV and locate its step/time column(s).

    Parameters
    ----------
    file_name : str
        Path to the CSV file to read.

    Returns
    -------
    tuple
        ``(data, headers, matched_headers)`` — the rows as dicts, the
        column names, and ``(index, name)`` pairs for every column whose
        name contains 'step' or 'time' (case-insensitive).
    """
    with open(file_name, "r") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; fall back to [] so the
        # header scan below does not raise TypeError.
        headers = reader.fieldnames or []
        data = list(reader)

    matched_headers = [
        (i, header)
        for i, header in enumerate(headers)
        if re.search(r"(step|time)", header, re.IGNORECASE)
    ]

    return data, headers, matched_headers


def plot_data(data, headers, matched_headers):
    """Plot every other column against the first step/time column found.

    ``data``, ``headers``, and ``matched_headers`` are the values produced
    by ``process_csv``. Each plottable column is saved to
    ``{xlab}_vs_{column}.png``. Returns a comma-separated string of the
    file names written, or ``None`` when no step/time column exists.
    Raises ``Exception`` when every non-time column is non-numeric.
    """
    # Guard clause: without a step/time column there is nothing to plot.
    if not matched_headers:
        print("No 'step' or 'time' headers found.")
        return

    x_header = matched_headers[0][1]
    xlab = "step" if "step" in x_header.lower() else "time"

    skipped = []
    saved = []
    for column in headers:
        if column == x_header:
            continue
        try:
            xs = [float(row[x_header]) for row in data]
            ys = [float(row[column]) for row in data]

            # Strip a trailing "(units)" suffix from the label, if any.
            label = (
                column.split("(")[0].strip() if "(" in column else column
            ).lower()
            out_name = f"{xlab}_vs_{label}.png"

            # Render and persist the figure.
            plt.figure()
            plt.plot(xs, ys)
            plt.xlabel(xlab)
            plt.ylabel(column)
            plt.title(f"{xlab} vs {label}")
            plt.savefig(out_name)
            plt.close()

            saved.append(out_name)
        except ValueError:
            skipped.append(column)

    # -1 accounts for the x-axis column itself.
    if len(skipped) == len(headers) - 1:
        raise Exception("All plots failed due to non-numeric data.")

    return ", ".join(saved)
from mdagent.utils import FileType, PathRegistry


class PlottingTools:
    """Plot each numeric column of a registered simulation CSV log against
    its step/time column, saving and registering one figure per column.

    Workflow: ``_find_file`` resolves a registry file id to a path,
    ``process_csv`` loads and validates the data, ``plot_data`` writes the
    figures under ``files/figures/`` and maps them in the path registry.
    """

    def __init__(
        self,
        path_registry,
    ):
        # Registry used both to resolve the input CSV id and to record
        # the figures this tool produces.
        self.path_registry = path_registry
        self.data = None  # CSV rows (list of dicts) once process_csv runs
        self.headers = None  # CSV column names
        self.matched_headers = None  # (index, name) pairs matching step/time
        self.file_id = None  # registry id of the CSV being processed
        self.file_path = None  # resolved filesystem path for file_id

    def _find_file(self, file_id: str) -> None:
        """Resolve *file_id* through the registry into ``self.file_path``.

        Raises FileNotFoundError when the id has no mapped path.
        """
        self.file_id = file_id
        self.file_path = self.path_registry.get_mapped_path(file_id)
        if not self.file_path:
            raise FileNotFoundError("File not found.")
        return None

    def process_csv(self) -> None:
        """Read the CSV, caching rows, headers, and step/time columns.

        Raises ValueError when the file yields no data, no headers, or no
        column whose name contains 'step' or 'time' (case-insensitive).
        """
        with open(self.file_path, "r") as f:
            reader = csv.DictReader(f)
            # fieldnames is None for an empty file; normalize to [].
            self.headers = reader.fieldnames if reader.fieldnames is not None else []
            self.data = list(reader)

        self.matched_headers = [
            (i, header)
            for i, header in enumerate(self.headers)
            if re.search(r"(step|time)", header, re.IGNORECASE)
        ]

        if not self.matched_headers or not self.headers or not self.data:
            raise ValueError("File could not be processed.")
        return None

    def plot_data(self) -> str:
        """Plot every numeric column against the first step/time column.

        Each figure is saved under ``files/figures/`` with a name issued by
        the path registry, then registered via ``map_path``. Returns a
        comma-separated string of the figure file names created. Raises
        ValueError when no timestep column was cached and Exception when
        every non-time column failed to parse as numeric.
        """
        if self.matched_headers:
            time_or_step = self.matched_headers[0][1]
            xlab = "step" if "step" in time_or_step.lower() else "time"
        else:
            raise ValueError("No timestep found.")

        failed_headers = []
        created_plots = []
        for header in self.headers:
            if header != time_or_step:
                try:
                    x = [float(row[time_or_step]) for row in self.data]
                    y = [float(row[header]) for row in self.data]

                    # Strip a trailing "(units)" suffix from the label.
                    header_lab = (
                        header.split("(")[0].strip() if "(" in header else header
                    ).lower()
                    # Generate and save the plot
                    plt.figure()
                    plt.plot(x, y)
                    plt.xlabel(xlab)
                    plt.ylabel(header)
                    plt.title(f"{self.file_id}_{xlab} vs {header_lab}")
                    fig_vs = f"{xlab}vs{header_lab}"
                    # Registry issues the canonical figure file name, and a
                    # file id is derived from it for mapping below.
                    plot_name = self.path_registry.write_file_name(
                        type=FileType.FIGURE,
                        Log_id=self.file_id,
                        fig_analysis=fig_vs,
                        file_format="png",
                    )
                    plot_id = self.path_registry.get_fileid(
                        file_name=plot_name, type=FileType.FIGURE
                    )
                    # Ensure the output directory exists before savefig.
                    if not os.path.exists("files/figures"):
                        os.makedirs("files/figures")
                    plt.savefig(f"files/figures/{plot_name}")
                    self.path_registry.map_path(
                        plot_id,
                        f"files/figures/{plot_name}",
                        (
                            f"Post Simulation Figure for {self.file_id}"
                            f" - {header_lab} vs {xlab}"
                        ),
                    )
                    plt.close()

                    created_plots.append(plot_name)
                except ValueError:
                    # Non-numeric column: record it and move on.
                    failed_headers.append(header)

        if (
            len(failed_headers) == len(self.headers) - 1
        ):  # -1 to account for time_or_step header
            raise Exception("All plots failed due to non-numeric data.")

        return ", ".join(created_plots)


class SimulationOutputFigures(BaseTool):
Expand All @@ -71,24 +111,27 @@ class SimulationOutputFigures(BaseTool):
simulation and create figures for
all physical parameters
versus timestep of the simulation.
Give this tool the path to the
csv file output from the simulation."""
Give this tool the name of the
csv file output from the simulation.
The tool will get the exact path."""

path_registry: Optional[PathRegistry]

def _run(self, file_path: str) -> str:
    def __init__(self, path_registry: Optional[PathRegistry] = None):
        # Run BaseTool's (pydantic) initialization first, then attach the
        # registry that _run uses to resolve file ids to on-disk paths.
        super().__init__()
        self.path_registry = path_registry

def _run(self, file_id: str) -> str:
"""use the tool."""
try:
data, headers, matched_headers = process_csv(file_path)
plot_result = plot_data(data, headers, matched_headers)
plotting_tools = PlottingTools(self.path_registry)
plotting_tools._find_file(file_id)
plotting_tools.process_csv()
plot_result = plotting_tools.plot_data()
if type(plot_result) == str:
return "Figures created: " + plot_result
else:
return "No figures created."
except ValueError:
return "No timestep data found in csv file."
except FileNotFoundError:
return "Issue with CSV file, file not found."
except Exception as e:
return str(e)

Expand Down
23 changes: 18 additions & 5 deletions mdagent/tools/base_tools/analysis_tools/ppi_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
from langchain.tools import BaseTool
from pydantic import BaseModel, Field

from mdagent.utils import PathRegistry

def ppi_distance(pdb_file, binding_site="protein"):

def ppi_distance(file_path, binding_site="protein"):
"""
Calculates minimum heavy-atom distance between peptide (assumed to be
smallest chain) and protein. Returns average distance between these two.
Expand All @@ -16,7 +18,7 @@ def ppi_distance(pdb_file, binding_site="protein"):
Can work with any protein-protein interaction (PPI)
"""
# load and find smallest chain
u = mda.Universe(pdb_file)
u = mda.Universe(file_path)
peptide = None
for chain in u.segments:
if peptide is None or len(chain.residues) < len(peptide):
Expand Down Expand Up @@ -49,14 +51,25 @@ class PPIDistance(BaseTool):
name: str = "ppi_distance"
description: str = """Useful for calculating minimum heavy-atom distance
between peptide and protein. First, make sure you have valid PDB file with
any protein-protein interaction."""
any protein-protein interaction. Give this tool the name of the file. The
tool will find the path."""
args_schema: Type[BaseModel] = PPIDistanceInputSchema
path_registry: Optional[PathRegistry]

    def __init__(self, path_registry: Optional[PathRegistry]):
        # Run BaseTool's (pydantic) initialization first, then attach the
        # registry that _run uses to map a PDB file name to its stored path.
        super().__init__()
        self.path_registry = path_registry

def _run(self, pdb_file: str, binding_site: str = "protein"):
if not pdb_file.endswith(".pdb"):
if not self.path_registry:
return "Error: Path registry is not set" # this should not happen
file_path = self.path_registry.get_mapped_path(pdb_file)
if not file_path:
return f"File not found: {pdb_file}"
if not file_path.endswith(".pdb"):
return "Error with input: PDB file must have .pdb extension"
try:
avg_dist = ppi_distance(pdb_file, binding_site=binding_site)
avg_dist = ppi_distance(file_path, binding_site=binding_site)
except ValueError as e:
return (
f"ValueError: {e}. \nMake sure to provide valid PBD "
Expand Down
Loading

0 comments on commit b8e51c6

Please sign in to comment.