diff --git a/README.md b/README.md index ba6c9eae..16d0f402 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ pip install git+https://github.com/ur-whitelab/MDCrow.git ## Usage The next step is to set up your API keys in your environment. An API key for LLM provider is necessary for this project. Supported LLM providers are OpenAI, TogetherAI, Fireworks, and Anthropic. -We recommend setting up api keys in a .env file. You can use the provided .env.example file as a template. +Other tools require API keys, such as paper-qa for literature searches. We recommend setting up the keys in a .env file. You can use the provided .env.example file as a template. 1. Copy the `.env.example` file and rename it to `.env`: `cp .env.example .env` 2. Replace the placeholder values in `.env` with your actual keys diff --git a/mdcrow/agent/agent.py b/mdcrow/agent/agent.py index db1428de..2a704356 100644 --- a/mdcrow/agent/agent.py +++ b/mdcrow/agent/agent.py @@ -1,5 +1,4 @@ import os -from datetime import datetime from dotenv import load_dotenv from langchain.agents import AgentExecutor, OpenAIFunctionsAgent @@ -47,7 +46,6 @@ def __init__( uploaded_files=[], # user input files to add to path registry run_id="", use_memory=False, - safe_mode=False, paper_dir=None, # papers for pqa, relative path within repo ): self.llm = _make_llm(model, temp, streaming) @@ -62,8 +60,8 @@ def __init__( self.run_id = self.memory.run_id self.uploaded_files = uploaded_files - # for file in uploaded_files: # todo -> allow users to add descriptions? - # self.path_registry.map_path(file, file, description="User uploaded file") + for file in uploaded_files: # todo -> allow users to add descriptions? 
+ self.path_registry.map_path(file, file, description="User uploaded file") self.agent = None self.agent_type = agent_type @@ -72,43 +70,6 @@ def __init__( self.user_tools = tools self.verbose = verbose - if self.uploaded_files: - self.add_file(self.uploaded_files) - self.safe_mode = safe_mode - - def _add_single_file(self, file_path, description=None): - now = datetime.now() - # Format the date and time as "YYYYMMDD_HHMMSS" - timestamp = now.strftime("%Y%m%d_%H%M%S") - i = 0 - ID = "UPL_" + str(i) + timestamp - while ID in self.path_registry.list_path_names(): # check if ID already exists - i += 1 - ID = "UPL_" + str(i) + timestamp - if not description: - # asks for user input to add description for file file_path - # wait for 20 seconds or set up a default description - description = "User uploaded file" - print(f"Adding file {file_path} with ID {ID}\n") - self.path_registry.map_path(ID, file_path, description=description) - - def add_file(self, uploaded_files): - if type(uploaded_files) == str: - self._add_single_file(uploaded_files) - elif type(uploaded_files) == tuple: - self._add_single_file(uploaded_files[0], description=uploaded_files[1]) - elif type(uploaded_files) == list: - for file_path in uploaded_files: - print(f"Adding file {file_path}\n") - print(type(file_path)) - self.add_file(file_path) - else: - raise ValueError( - "Invalid input. Please provide a file path \ - or list of file paths. 
Optionally, tuple or list of tuples\ - of file path and description" - ) - def _initialize_tools_and_agent(self, user_input=None): """Retrieve tools and initialize the agent.""" if self.user_tools is not None: @@ -127,7 +88,6 @@ def _initialize_tools_and_agent(self, user_input=None): self.tools = make_all_tools( self.tools_llm, human=self.use_human_tool, - safe_mode=self.safe_mode, ) return AgentExecutor.from_agent_and_tools( tools=self.tools, diff --git a/mdcrow/tools/base_tools/preprocess_tools/pdb_get.py b/mdcrow/tools/base_tools/preprocess_tools/pdb_get.py index c6606f1d..8faf7e5d 100644 --- a/mdcrow/tools/base_tools/preprocess_tools/pdb_get.py +++ b/mdcrow/tools/base_tools/preprocess_tools/pdb_get.py @@ -272,7 +272,7 @@ def small_molecule_pdb(self, mol_str: str) -> str: except Exception as e: print( "There was an error getting pdb. Please input a single molecule name." - f"{mol_str}" + f"{mol_str},{mol_name}" ) return ( "Failed. There was an error getting pdb. " diff --git a/mdcrow/tools/base_tools/simulation_tools/create_simulation.py b/mdcrow/tools/base_tools/simulation_tools/create_simulation.py index 163a7af5..86edd2d6 100644 --- a/mdcrow/tools/base_tools/simulation_tools/create_simulation.py +++ b/mdcrow/tools/base_tools/simulation_tools/create_simulation.py @@ -1,4 +1,3 @@ -import os import textwrap from typing import Optional @@ -17,7 +16,7 @@ class ModifyScriptUtils: def __init__(self, llm): self.llm = llm - def _prompt_summary(self, task: dict): + def _prompt_summary(self, query: str): if not self.llm: raise ValueError("No language model provided at ModifyScriptTool") @@ -51,7 +50,7 @@ def _prompt_summary(self, task: dict): ) llm_chain = prompt | self.llm | StrOutputParser() - return llm_chain.invoke(task) + return llm_chain.invoke(query) # Remove leading spaces for proper formatting @@ -62,16 +61,15 @@ def remove_leading_spaces(self, text): class ModifyScriptInput(BaseModel): - script_id: str = Field(..., description=" File ID of the simulation 
script file") query: str = Field( ..., description=( - "simulation required by the user. Be as descriptive as possible" - " including requirements of the simulation, such as the forcefields, " - "integrator, and constraints. Also, mention the protein you are working on." + "simulation required by the user.You MUST " + "specify the objective, requirements of the simulation as well " "as on what protein you are working." ), ) + script: str = Field(..., description=" simulation ID of the base script file") class ModifyBaseSimulationScriptTool(BaseTool): @@ -84,33 +82,25 @@ class ModifyBaseSimulationScriptTool(BaseTool): args_schema = ModifyScriptInput llm: Optional[BaseLanguageModel] path_registry: Optional[PathRegistry] - safe_mode: Optional[bool] - def __init__(self, path_registry, llm, safe_mode=False): + def __init__(self, path_registry: Optional[PathRegistry], llm): super().__init__() self.path_registry = path_registry self.llm = llm - self.safe_mode = safe_mode - - def _run(self, script_id: str, query: str) -> str: - # if len(args) > 0: - # return ( - # "Failed. This tool expects you to provide the input as a " - # "dictionary: {'query': 'your query', 'script': 'script id'}" - # ) + + def _run(self, *args, **input): + if len(args) > 0: + return ( + "Failed. This tool expects you to provide the input as a " + "dictionary: {'query': 'your query', 'script': 'script id'}" + ) if not self.path_registry: return "Failed. No path registry provided" # this should not happen - base_script_id = script_id + base_script_id = input.get("script") if not base_script_id: return ( "Failed. No id provided. The keys for the input are: " - "query' and 'script_id'" - ) - current_ids = self.path_registry.list_path_names() - if base_script_id not in current_ids: - return ( - f"Failed. 
File ID not found: {base_script_id}, make sure " - "the script ID is correct" + "query' and 'script'" ) try: base_script_path = self.path_registry.get_mapped_path(base_script_id) @@ -119,24 +109,18 @@ def _run(self, script_id: str, query: str) -> str: parts[-1] except Exception as e: return f"Failed. Error getting path from file id: {e}" - if os.path.exists(base_script_path): - with open(base_script_path, "r") as file: - base_script = file.read() - else: - return f"Failed. File not found: {base_script_id}" - + with open(base_script_path, "r") as file: + base_script = file.read() base_script = "".join(base_script) utils = ModifyScriptUtils(self.llm) - description = query + description = input.get("query") answer = utils._prompt_summary( - task={"base_script": base_script, "query": description} + query={"base_script": base_script, "query": description} ) - print("This the answer from the LLM\n\n", answer) - # script = answer["text"] - thoughts, new_script = answer.split("SCRIPT:") - # script_content = utils.remove_leading_spaces(new_script) - script_content = new_script + script = answer["text"] + thoughts, new_script = script.split("SCRIPT:") + script_content = utils.remove_leading_spaces(new_script) if "FINAL THOUGHTS:" in script_content: script_content, final_thoughts = script_content.split("FINAL THOUGHTS:") # replace ''' with # @@ -151,21 +135,8 @@ def _run(self, script_id: str, query: str) -> str: with open(f"{directory}/{filename}", "w") as file: file.write(script_content) - self.path_registry.map_path(file_id, f"{directory}/{filename}", description) - # if safe mode is on, return the file id - if self.safe_mode: - return f"Succeeded. Script modified successfully. Modified Script ID: {file_id}" - # if safe mode is off, try to run the script - try: - exec(script_content) - return f"Succeeded. Script modified and ran \ - successfully. Modified Script ID: {file_id}" - except Exception as e: - return ( - f"Failed. Error running the script: {e}." 
- "Modified Script ID: {file_id}. If you want to try to correct the " - "script, use the file id of the modified to correct the script." - ) + self.path_registry.map_path(file_id, filename, description) + return f"Succeeded. Script modified successfully. Modified Script ID: {file_id}" async def _arun(self, query) -> str: """Use the tool asynchronously.""" diff --git a/mdcrow/tools/base_tools/simulation_tools/setup_and_run.py b/mdcrow/tools/base_tools/simulation_tools/setup_and_run.py index 0f83af4c..fbf4e278 100644 --- a/mdcrow/tools/base_tools/simulation_tools/setup_and_run.py +++ b/mdcrow/tools/base_tools/simulation_tools/setup_and_run.py @@ -723,22 +723,14 @@ def _construct_script_content( system.addForce(MonteCarloBarostat(pressure, temperature, barostatInterval)) """ - if ( - integrator_type == "LangevinMiddle" - and constraints != "None" - and constraints - ): - print("Constraints must be set to 'None' for LangevinMiddle integrator.") - print(integrator_type, "constraints: ", constraints) + if integrator_type == "LangevinMiddle" and constraints != "None": script_content += """ integrator = LangevinMiddleIntegrator(temperature, friction, dt) integrator.setConstraintTolerance(constraintTolerance) simulation = Simulation(modeller.topology, system, integrator, platform) simulation.context.setPositions(modeller.positions) """ - if integrator_type == "LangevinMiddle" and ( - constraints == "None" or constraints is None - ): + if integrator_type == "LangevinMiddle" and constraints == "None": script_content += """ integrator = LangevinMiddleIntegrator(temperature, friction, dt) simulation = Simulation(modeller.topology, system, integrator, platform) @@ -750,15 +742,6 @@ def _construct_script_content( print('Performing energy minimization...') simulation.minimizeEnergy() - ## Save initial positions - top_name = 'simulation_initial_positions.pdb' - top_description = 'Initial positions of the simulation' - with open(top_name, "w") as f: - \tPDBFile.writeFile( - 
\tsimulation.topology, - \tsimulation.context.getState(getPositions=True).getPositions(), - \tf, - \t) print('Equilibrating...') simulation.context.setVelocitiesToTemperature(temperature) simulation.step(equilibrationSteps) diff --git a/mdcrow/tools/maketools.py b/mdcrow/tools/maketools.py index 86b5ec6c..e9cac91b 100644 --- a/mdcrow/tools/maketools.py +++ b/mdcrow/tools/maketools.py @@ -63,7 +63,6 @@ def make_all_tools( llm: BaseLanguageModel, human=False, - safe_mode=False, ): load_dotenv() all_tools = [] @@ -72,9 +71,7 @@ def make_all_tools( all_tools += agents.load_tools(["llm-math"], llm) # all_tools += [PythonREPLTool()] all_tools += [ - ModifyBaseSimulationScriptTool( - path_registry=path_instance, llm=llm, safe_mode=safe_mode - ), + ModifyBaseSimulationScriptTool(path_registry=path_instance, llm=llm), ] if path_instance.ckpt_papers: all_tools += [Scholar2ResultLLM(llm=llm, path_registry=path_instance)] diff --git a/notebooks/experiments/prompts.md b/notebooks/experiments/prompts.md new file mode 100644 index 00000000..b0185ac8 --- /dev/null +++ b/notebooks/experiments/prompts.md @@ -0,0 +1,25 @@ +Simulate pdb 1MBN at two different temperatures: 300K, 400K for 1ns seconds each. Plot RMSD of both over time, and compare the final secondary structures at the end of the simulations. +Download the pdb file for PDB ID 1LYZ. +Download the PDB file for PDB ID 1GZX. Then, analyze the secondary structure of the protein and tell me how many chains, sheets, etc. there are. +What are common parameters to simulate fibronectin? +Simulate 1XQ8 for 1ns at temperate 300K. Then tell me if the secondary structure changed from before the simulation to after. +Simulate 1A3N and 7VDE, two PDB IDs for hemoglobin with the same parameters. Find the appropriate parameters from literature. Then, plot the radius of gyration throughout the both simulations. +Simulate 1ZNI for 1ns at temp=300K. +Simulate 4RMB at 100K, 200K, and 300K. 
Then plot the radius of gyration over time for all three simulations. Lastly, compare the change in secondary structure for the three analyses throughout the simulation. +What are the known interactions of protein 1BDG? +Download the PDB file for 1AEE. Then, tell me how many chains and atoms are in the protein. +Simulate protein 1ZNI at 300K for 1ns and calculate RMSD. +Download the PDB files for 8PFK and 8PFQ. Then, compare the secondary structure of the two proteins, including number of atoms, secondary structure, number of chains, etc. +Simulate fibronectin (PDB ID 1FNF) for 1ns. Use an appropriate temperature from literature. +Compare the RMSF of 1UBQ at high pressure and low pressure. Perform the simulation for 1 ns and vary only the pressure. +Simulate hemoglobin oxygenated (1A3N) and de-oxygenated (6BB5) +Simulate Trypsin (1TRN) for 1ns at 300K and compute SASA. +Download the pdb file for 1C3W and describe the secondary structure. Then simulate the protein at 300K for 1ns. Plot RMSD over time and radius of gyration over time. +Download the PDB file for 1XQ8. Then, save the visualization for it. +Download the PDB for 2YXF. Tell me about its stability, as found in literature. Then, simulate it for 1ns and plot its RMSD over time. +Simulate 1MBN in water and in methanol solutions. +Download Protein 1ATN +Download and clean protein 1A3N +Perform a short simulation of protein 1PQ2 +Analyze the rdf of the simulation of 1A3N solvated in water +Make an rdf analysis of both oxygenated and deoxygenated hemoglobin structures