Skip to content

Commit

Permalink
merging main into branch
Browse files Browse the repository at this point in the history
  • Loading branch information
Jgmedina95 committed Mar 18, 2024
2 parents ccc1b95 + ee28a70 commit b8e51c6
Show file tree
Hide file tree
Showing 57 changed files with 10,109 additions and 2,475 deletions.
3 changes: 0 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,5 @@
# OpenAI API Key
OPENAI_API_KEY=YOUR_OPENAI_API_KEY_GOES_HERE # pragma: allowlist secret

# PQA API Key
PQA_API_KEY=YOUR_PQA_API_KEY_GOES_HERE # pragma: allowlist secret

# Serp API key
SERP_API_KEY=YOUR_SERP_API_KEY_GOES_HERE # pragma: allowlist secret
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:

steps:
- uses: actions/checkout@v2
- name: Set up Python "3.9"
- name: Set up Python "3.11"
uses: actions/setup-python@v2
with:
python-version: "3.9"
python-version: "3.11"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ jobs:
environment-file: environment.yaml
python-version: ${{ matrix.python-version }}
auto-activate-base: true
- name: Install openmm pdbfixer mdanalysis with conda
- name: Install pdbfixer with conda
shell: bash -l {0}
run: |
conda install -c conda-forge openmm pdbfixer mdanalysis
conda install -c conda-forge pdbfixer
- name: Install dependencies
shell: bash -l {0}
run: |
Expand All @@ -45,6 +45,5 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
PQA_API_KEY : ${{ secrets.PQA_API_TOKEN }}
run: |
pytest -m "not skip" tests
3 changes: 0 additions & 3 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,5 @@
# Rule for detecting OpenAI API keys
OpenAI API Key: \b[secrets]{3}_[a-zA-Z0-9]{32}\b

# Rule for detecting pqa API keys
PQA API Key: "pqa[a-zA-Z0-9-._]+"

# Rule for detecting serp API keys
# Serp API Key: "[a-zA-Z0-9]{64}"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ To use the OpenMM features in the agent, please set up a conda environment, foll
- Create conda environment: `conda env create -n mdagent -f environment.yaml`
- Activate your environment: `conda activate mdagent`

If you already have a conda environment, you can install the necessary dependencies with the following steps.
- Install the necessary conda dependencies: `conda install -c conda-forge openmm pdbfixer mdanalysis`
If you already have a conda environment, you can install pdbfixer, a necessary dependency, with the following steps.
- Install the necessary conda dependencies: `conda install -c conda-forge pdbfixer`


## Installation
Expand Down
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pre-commit
pytest
pytest-mock
25 changes: 11 additions & 14 deletions mdagent/tools/base_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
from .analysis_tools.plot_tools import SimulationOutputFigures
from .analysis_tools.ppi_tools import PPIDistance
from .analysis_tools.rdf_tool import RDFTool
from .analysis_tools.rmsd_tools import RMSDCalculator
from .analysis_tools.vis_tools import (
CheckDirectoryFiles,
VisFunctions,
VisualizeProtein,
from .analysis_tools.rgy import (
RadiusofGyrationAverage,
RadiusofGyrationPerFrame,
RadiusofGyrationPlot,
)
from .analysis_tools.rmsd_tools import RMSDCalculator
from .analysis_tools.vis_tools import VisFunctions, VisualizeProtein
from .preprocess_tools.clean_tools import (
AddHydrogensCleaningTool,
CleaningToolFunction,
CleaningTools,
RemoveWaterCleaningTool,
SpecializedCleanTool,
)
from .preprocess_tools.pdb_tools import (
PackMolTool,
ProteinName2PDBTool,
SmallMolPDB,
get_pdb,
)
from .preprocess_tools.packing import PackMolTool
from .preprocess_tools.pdb_get import ProteinName2PDBTool, SmallMolPDB, get_pdb
from .simulation_tools.create_simulation import ModifyBaseSimulationScriptTool
from .simulation_tools.setup_and_run import (
InstructionSummary,
SetUpandRunFunction,
SetUpAndRunTool,
SimulationFunctions,
Expand All @@ -33,9 +29,7 @@

__all__ = [
"AddHydrogensCleaningTool",
"CheckDirectoryFiles",
"CleaningTools",
"InstructionSummary",
"ListRegistryPaths",
"MapPath2Name",
"ProteinName2PDBTool",
Expand All @@ -45,6 +39,9 @@
"VisualizeProtein",
"RMSDCalculator",
"RemoveWaterCleaningTool",
"RadiusofGyrationAverage",
"RadiusofGyrationPerFrame",
"RadiusofGyrationPlot",
"Scholar2ResultLLM",
"SerpGitTool",
"SetUpAndRunTool",
Expand Down
7 changes: 5 additions & 2 deletions mdagent/tools/base_tools/analysis_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from .plot_tools import SimulationOutputFigures
from .ppi_tools import PPIDistance
from .rgy import RadiusofGyrationAverage, RadiusofGyrationPerFrame, RadiusofGyrationPlot
from .rmsd_tools import RMSDCalculator
from .vis_tools import CheckDirectoryFiles, VisFunctions, VisualizeProtein
from .vis_tools import VisFunctions, VisualizeProtein

__all__ = [
"PPIDistance",
"RMSDCalculator",
"RadiusofGyrationPerFrame",
"RadiusofGyrationPlot",
"SimulationOutputFigures",
"CheckDirectoryFiles",
"VisualizeProtein",
"VisFunctions",
"RadiusofGyrationAverage",
]
175 changes: 109 additions & 66 deletions mdagent/tools/base_tools/analysis_tools/plot_tools.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,107 @@
import csv
import os
import re
from typing import Optional

import matplotlib.pyplot as plt
from langchain.tools import BaseTool

from mdagent.utils import PathRegistry


def process_csv(file_name):
    """Load a simulation-output CSV and locate its step/time column(s).

    Parameters
    ----------
    file_name : str
        Path to the CSV file to read.

    Returns
    -------
    tuple
        ``(data, headers, matched_headers)`` — the rows as dicts, the
        column names, and ``(index, name)`` pairs for every column whose
        name contains 'step' or 'time' (case-insensitive).
    """
    with open(file_name, "r") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; fall back to [] so the
        # header scan below does not raise TypeError.
        headers = reader.fieldnames or []
        data = list(reader)

    matched_headers = [
        (i, header)
        for i, header in enumerate(headers)
        if re.search(r"(step|time)", header, re.IGNORECASE)
    ]

    return data, headers, matched_headers


def plot_data(data, headers, matched_headers):
    """Plot every other column against the first step/time column found.

    ``data``, ``headers``, and ``matched_headers`` are the values produced
    by ``process_csv``. Each plottable column is saved to
    ``{xlab}_vs_{column}.png``. Returns a comma-separated string of the
    file names written, or ``None`` when no step/time column exists.
    Raises ``Exception`` when every non-time column is non-numeric.
    """
    # Guard clause: without a step/time column there is nothing to plot.
    if not matched_headers:
        print("No 'step' or 'time' headers found.")
        return

    x_header = matched_headers[0][1]
    xlab = "step" if "step" in x_header.lower() else "time"

    skipped = []
    saved = []
    for column in headers:
        if column == x_header:
            continue
        try:
            xs = [float(row[x_header]) for row in data]
            ys = [float(row[column]) for row in data]

            # Strip a trailing "(units)" suffix from the label, if any.
            label = (
                column.split("(")[0].strip() if "(" in column else column
            ).lower()
            out_name = f"{xlab}_vs_{label}.png"

            # Render and persist the figure.
            plt.figure()
            plt.plot(xs, ys)
            plt.xlabel(xlab)
            plt.ylabel(column)
            plt.title(f"{xlab} vs {label}")
            plt.savefig(out_name)
            plt.close()

            saved.append(out_name)
        except ValueError:
            skipped.append(column)

    # -1 accounts for the x-axis column itself.
    if len(skipped) == len(headers) - 1:
        raise Exception("All plots failed due to non-numeric data.")

    return ", ".join(saved)
from mdagent.utils import FileType, PathRegistry


class PlottingTools:
    """Plot each numeric column of a registered simulation CSV log against
    its step/time column, saving and registering one figure per column.

    Workflow: ``_find_file`` resolves a registry file id to a path,
    ``process_csv`` loads and validates the data, ``plot_data`` writes the
    figures under ``files/figures/`` and maps them in the path registry.
    """

    def __init__(
        self,
        path_registry,
    ):
        # Registry used both to resolve the input CSV id and to record
        # the figures this tool produces.
        self.path_registry = path_registry
        self.data = None  # CSV rows (list of dicts) once process_csv runs
        self.headers = None  # CSV column names
        self.matched_headers = None  # (index, name) pairs matching step/time
        self.file_id = None  # registry id of the CSV being processed
        self.file_path = None  # resolved filesystem path for file_id

    def _find_file(self, file_id: str) -> None:
        """Resolve *file_id* through the registry into ``self.file_path``.

        Raises FileNotFoundError when the id has no mapped path.
        """
        self.file_id = file_id
        self.file_path = self.path_registry.get_mapped_path(file_id)
        if not self.file_path:
            raise FileNotFoundError("File not found.")
        return None

    def process_csv(self) -> None:
        """Read the CSV, caching rows, headers, and step/time columns.

        Raises ValueError when the file yields no data, no headers, or no
        column whose name contains 'step' or 'time' (case-insensitive).
        """
        with open(self.file_path, "r") as f:
            reader = csv.DictReader(f)
            # fieldnames is None for an empty file; normalize to [].
            self.headers = reader.fieldnames if reader.fieldnames is not None else []
            self.data = list(reader)

        self.matched_headers = [
            (i, header)
            for i, header in enumerate(self.headers)
            if re.search(r"(step|time)", header, re.IGNORECASE)
        ]

        if not self.matched_headers or not self.headers or not self.data:
            raise ValueError("File could not be processed.")
        return None

    def plot_data(self) -> str:
        """Plot every numeric column against the first step/time column.

        Each figure is saved under ``files/figures/`` with a name issued by
        the path registry, then registered via ``map_path``. Returns a
        comma-separated string of the figure file names created. Raises
        ValueError when no timestep column was cached and Exception when
        every non-time column failed to parse as numeric.
        """
        if self.matched_headers:
            time_or_step = self.matched_headers[0][1]
            xlab = "step" if "step" in time_or_step.lower() else "time"
        else:
            raise ValueError("No timestep found.")

        failed_headers = []
        created_plots = []
        for header in self.headers:
            if header != time_or_step:
                try:
                    x = [float(row[time_or_step]) for row in self.data]
                    y = [float(row[header]) for row in self.data]

                    # Strip a trailing "(units)" suffix from the label.
                    header_lab = (
                        header.split("(")[0].strip() if "(" in header else header
                    ).lower()
                    # Generate and save the plot
                    plt.figure()
                    plt.plot(x, y)
                    plt.xlabel(xlab)
                    plt.ylabel(header)
                    plt.title(f"{self.file_id}_{xlab} vs {header_lab}")
                    fig_vs = f"{xlab}vs{header_lab}"
                    # Registry issues the canonical figure file name, and a
                    # file id is derived from it for mapping below.
                    plot_name = self.path_registry.write_file_name(
                        type=FileType.FIGURE,
                        Log_id=self.file_id,
                        fig_analysis=fig_vs,
                        file_format="png",
                    )
                    plot_id = self.path_registry.get_fileid(
                        file_name=plot_name, type=FileType.FIGURE
                    )
                    # Ensure the output directory exists before savefig.
                    if not os.path.exists("files/figures"):
                        os.makedirs("files/figures")
                    plt.savefig(f"files/figures/{plot_name}")
                    self.path_registry.map_path(
                        plot_id,
                        f"files/figures/{plot_name}",
                        (
                            f"Post Simulation Figure for {self.file_id}"
                            f" - {header_lab} vs {xlab}"
                        ),
                    )
                    plt.close()

                    created_plots.append(plot_name)
                except ValueError:
                    # Non-numeric column: record it and move on.
                    failed_headers.append(header)

        if (
            len(failed_headers) == len(self.headers) - 1
        ):  # -1 to account for time_or_step header
            raise Exception("All plots failed due to non-numeric data.")

        return ", ".join(created_plots)


class SimulationOutputFigures(BaseTool):
Expand All @@ -71,24 +111,27 @@ class SimulationOutputFigures(BaseTool):
simulation and create figures for
all physical parameters
versus timestep of the simulation.
Give this tool the path to the
csv file output from the simulation."""
Give this tool the name of the
csv file output from the simulation.
The tool will get the exact path."""

path_registry: Optional[PathRegistry]

def _run(self, file_path: str) -> str:
    def __init__(self, path_registry: Optional[PathRegistry] = None):
        # Run BaseTool's (pydantic) initialization first, then attach the
        # registry that _run uses to resolve file ids to on-disk paths.
        super().__init__()
        self.path_registry = path_registry

def _run(self, file_id: str) -> str:
"""use the tool."""
try:
data, headers, matched_headers = process_csv(file_path)
plot_result = plot_data(data, headers, matched_headers)
plotting_tools = PlottingTools(self.path_registry)
plotting_tools._find_file(file_id)
plotting_tools.process_csv()
plot_result = plotting_tools.plot_data()
if type(plot_result) == str:
return "Figures created: " + plot_result
else:
return "No figures created."
except ValueError:
return "No timestep data found in csv file."
except FileNotFoundError:
return "Issue with CSV file, file not found."
except Exception as e:
return str(e)

Expand Down
23 changes: 18 additions & 5 deletions mdagent/tools/base_tools/analysis_tools/ppi_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
from langchain.tools import BaseTool
from pydantic import BaseModel, Field

from mdagent.utils import PathRegistry

def ppi_distance(pdb_file, binding_site="protein"):

def ppi_distance(file_path, binding_site="protein"):
"""
Calculates minimum heavy-atom distance between peptide (assumed to be
smallest chain) and protein. Returns average distance between these two.
Expand All @@ -16,7 +18,7 @@ def ppi_distance(pdb_file, binding_site="protein"):
Can work with any protein-protein interaction (PPI)
"""
# load and find smallest chain
u = mda.Universe(pdb_file)
u = mda.Universe(file_path)
peptide = None
for chain in u.segments:
if peptide is None or len(chain.residues) < len(peptide):
Expand Down Expand Up @@ -49,14 +51,25 @@ class PPIDistance(BaseTool):
name: str = "ppi_distance"
description: str = """Useful for calculating minimum heavy-atom distance
between peptide and protein. First, make sure you have valid PDB file with
any protein-protein interaction."""
any protein-protein interaction. Give this tool the name of the file. The
tool will find the path."""
args_schema: Type[BaseModel] = PPIDistanceInputSchema
path_registry: Optional[PathRegistry]

    def __init__(self, path_registry: Optional[PathRegistry]):
        # Run BaseTool's (pydantic) initialization first, then attach the
        # registry that _run uses to map a PDB file name to its stored path.
        super().__init__()
        self.path_registry = path_registry

def _run(self, pdb_file: str, binding_site: str = "protein"):
if not pdb_file.endswith(".pdb"):
if not self.path_registry:
return "Error: Path registry is not set" # this should not happen
file_path = self.path_registry.get_mapped_path(pdb_file)
if not file_path:
return f"File not found: {pdb_file}"
if not file_path.endswith(".pdb"):
return "Error with input: PDB file must have .pdb extension"
try:
avg_dist = ppi_distance(pdb_file, binding_site=binding_site)
avg_dist = ppi_distance(file_path, binding_site=binding_site)
except ValueError as e:
return (
f"ValueError: {e}. \nMake sure to provide valid PBD "
Expand Down
Loading

0 comments on commit b8e51c6

Please sign in to comment.