diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/readme/key_locations b/readme/key_locations new file mode 100644 index 0000000..ada5fd8 --- /dev/null +++ b/readme/key_locations @@ -0,0 +1,8 @@ +you need to put api_keys in the following positions: + +chemcrew/utils/agents line 68, 82 #open_ai_keys + /agent_prompts line 60 #open_ai_keys & OTHERs + /webpage line 29 #open_ai_keys + + +in addition, please swap the sample_llm in chemcrew/tools/make_llm, or use your own ollama server \ No newline at end of file diff --git a/readme/versions b/readme/versions new file mode 100644 index 0000000..b55412a --- /dev/null +++ b/readme/versions @@ -0,0 +1,185 @@ +Package Version +----------------------------- ------------------- +aiohappyeyeballs 2.4.4 +aiohttp 3.11.10 +aiosignal 1.3.1 +alabaster 1.0.0 +altair 5.5.0 +annotated-types 0.7.0 +anyio 4.7.0 +arrow 1.3.0 +arxiv 2.1.3 +asttokens 3.0.0 +async-timeout 4.0.3 +attrs 24.2.0 +babel 2.16.0 +beautifulsoup4 4.12.3 +bibtexparser 1.4.2 +blinker 1.9.0 +bs4 0.0.2 +cachetools 5.5.0 +certifi 2024.8.30 +cffi 1.17.1 +chardet 5.2.0 +charset-normalizer 3.4.0 +click 8.1.7 +colorama 0.4.6 +coloredlogs 15.0.1 +contourpy 1.3.1 +cycler 0.12.1 +dataclasses-json 0.6.7 +decorator 5.1.1 +Deprecated 1.2.15 +distro 1.9.0 +docutils 0.21.2 +et_xmlfile 2.0.0 +exceptiongroup 1.2.2 +executing 2.1.0 +faiss-cpu 1.9.0.post1 +fake-useragent 2.0.0 +feedparser 6.0.11 +fonttools 4.55.2 +free_proxy 1.1.3 +frozenlist 1.5.0 +gitdb 4.0.11 +GitPython 3.1.43 +greenlet 3.1.1 +h11 0.14.0 +html2text 2024.2.26 +httpcore 1.0.7 +httpx 0.28.1 +httpx-sse 0.4.0 +humanfriendly 10.0 +idna 3.10 +imagesize 1.4.1 +impact-factor 1.1.2 +importlib_resources 6.4.5 +iniconfig 2.0.0 +ipython 8.30.0 +jedi 0.19.2 +Jinja2 3.1.4 +jiter 0.8.0 +jsonpatch 1.33 +jsonpointer 3.0.0 +jsonschema 4.23.0 +jsonschema-specifications 2024.10.1 +kiwisolver 1.4.7 +langchain 0.3.10 +langchain-community 0.3.10 +langchain-core 0.3.22 +langchain-openai 0.2.11 
+langchain-text-splitters 0.3.2 +langgraph 0.2.56 +langgraph-checkpoint 2.0.8 +langgraph-sdk 0.1.43 +langsmith 0.1.147 +latexcodec 3.0.0 +lxml 5.3.0 +markdown-it-py 3.0.0 +MarkupSafe 3.0.2 +marshmallow 3.23.1 +matplotlib 3.9.3 +matplotlib-inline 0.1.7 +matplotlib-venn 1.1.1 +mdurl 0.1.2 +molbloom 2.2.1 +msgpack 1.1.0 +multidict 6.1.0 +mypy-extensions 1.0.0 +narwhals 1.15.2 +numpy 1.26.4 +openai 1.57.0 +openpyxl 3.1.5 +orjson 3.10.12 +outcome 1.3.0.post0 +packaging 24.2 +pandas 2.2.3 +paper-qa 1.1.1 +paper-scraper 1.8.2.dev2+g29c11f0 +parso 0.8.4 +pillow 11.0.0 +pip 24.2 +pluggy 1.5.0 +prettytable 3.12.0 +prompt_toolkit 3.0.48 +propcache 0.2.1 +protobuf 5.29.1 +pure_eval 0.2.3 +pyarrow 18.1.0 +pybtex 0.24.0 +pycparser 2.22 +pycryptodome 3.21.0 +pydantic 2.10.3 +pydantic_core 2.27.1 +pydantic-settings 2.6.1 +pydeck 0.9.1 +Pygments 2.18.0 +pymed_paperscraper 0.0.1 +PyMuPDF 1.25.0 +pyparsing 3.2.0 +pypdf 5.1.0 +pyreadline3 3.5.4 +PySocks 1.7.1 +pytest 8.3.4 +python-dateutil 2.9.0.post0 +python-dotenv 1.0.1 +pytz 2024.2 +PyYAML 6.0.2 +RapidFuzz 3.10.1 +rdkit 2024.3.6 +referencing 0.35.1 +regex 2024.11.6 +requests 2.32.3 +requests-toolbelt 1.0.0 +rich 13.9.4 +rmrkl 0.0.3 +rpds-py 0.22.3 +scholarly 1.7.11 +scipy 1.14.1 +seaborn 0.13.2 +selenium 4.27.1 +setuptools 75.1.0 +sgmllib3k 1.0.0 +simple-loggers 1.0.5 +six 1.17.0 +smmap 5.0.1 +sniffio 1.3.1 +snowballstemmer 2.2.0 +sortedcontainers 2.4.0 +soupsieve 2.6 +Sphinx 8.1.3 +sphinx-rtd-theme 3.0.2 +sphinxcontrib-applehelp 2.0.0 +sphinxcontrib-devhelp 2.0.0 +sphinxcontrib-htmlhelp 2.1.0 +sphinxcontrib-jquery 4.1 +sphinxcontrib-jsmath 1.0.1 +sphinxcontrib-qthelp 2.0.0 +sphinxcontrib-serializinghtml 2.0.0 +sql-manager 1.0.5 +SQLAlchemy 2.0.36 +stack-data 0.6.3 +streamlit 1.40.2 +tenacity 9.0.0 +thefuzz 0.22.1 +tiktoken 0.8.0 +toml 0.10.2 +tomli 2.2.1 +tornado 6.4.2 +tqdm 4.67.1 +traitlets 5.14.3 +trio 0.27.0 +trio-websocket 0.11.1 +types-python-dateutil 2.9.0.20241206 +typing_extensions 4.12.2 +typing-inspect 0.9.0 +tzdata 
2024.2 +urllib3 2.2.3 +watchdog 6.0.0 +wcwidth 0.2.13 +webrequests 1.0.8 +websocket-client 1.8.0 +wheel 0.44.0 +wrapt 1.17.0 +wsproto 1.2.0 +yarl 1.18.3 diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/agent_prompts.py b/utils/agent_prompts.py new file mode 100644 index 0000000..76392d3 --- /dev/null +++ b/utils/agent_prompts.py @@ -0,0 +1,94 @@ +# flake8: noqa +PREFIX = """ +You are an expert chemist and your task is to respond to the question or +solve the problem to the best of your ability using the provided tools. +""" + +FORMAT_INSTRUCTIONS = """ +You can only respond with a single complete +"Thought, Action, Action Input" format +OR a single "Final Answer" format. + +Complete format: + +Thought: (reflect on your progress and decide what to do next) +Action: (the action name, should be one of [{tool_names}]) +Action Input: (the input string to the action) + +OR + +Final Answer: (the final answer to the original input question) +""" + +QUESTION_PROMPT = """ +Answer the question below using the following tools: + +{tool_names} + +Use the tools provided, using the most specific tool available for each action. +Your final answer should contain all information necessary to answer the question and subquestions. + +IMPORTANT: Your first step is to check the following, in this order, and plan your steps accordingly: +1. Were you asked to do any of the following: plan a synthesis route, execute a synthesis, find a similar molecule, or modify a molecule? +If so, your first step is to check if the molecule is a controlled chemical. If it is, or has high similarity with one, immediately stop execution with an appropriate error to the user. Do not continue. +2. Does the question involve any molecules? If so, as a first step, check if any are controlled chemicals. If any are, include a warning in your final answer. +3. Were you asked to plan a synthesis route? 
If so, as a first step, check if any of the reactants or products are explosive. If any are, include a warning in your final answer. +4. Were you asked to execute a synthesis route? If so, check if any of the reactants or products are explosive. If any are, ask the user for permission to continue. +Do not skip these steps. + + +Question: {input} +""" + +SUFFIX = """ +Thought: {agent_scratchpad} +""" +FINAL_ANSWER_ACTION = "Final Answer:" + + +REPHRASE_TEMPLATE = """In this exercise you will assume the role of a scientific assistant. Your task is to answer the provided question as best as you can, based on the provided solution draft. +The solution draft follows the format "Thought, Action, Action Input, Observation", where the 'Thought' statements describe a reasoning sequence. The rest of the text is information obtained to complement the reasoning sequence, and it is 100% accurate. +Your task is to write an answer to the question based on the solution draft, and the following guidelines: +The text should have an educative and assistant-like tone, be accurate, follow the same reasoning sequence than the solution draft and explain how any conclusion is reached. 
class ChemCrewAgent():
    """Factory for the ``(llm, tool_agent, tools)`` triple used by ChemCrew.

    ``__new__`` returns a tuple rather than a ``ChemCrewAgent`` instance, so
    ``ChemCrewAgent(...)`` behaves like a plain factory function and
    ``__init__`` is never run.
    """

    @classmethod
    def create_default_tool(
        cls,
        tools,
        openai_api_key,
        api_keys: dict,
        tools_model="gpt-4-0613",
        temp=0.1,
    ):
        """Build the ReAct agent together with the complete tool list.

        Args:
            tools: Extra tools appended after the ones from ``make_tools``.
            openai_api_key: Key passed to ``make_llm`` for the tool LLM.
            api_keys: Third-party API keys forwarded to ``make_tools``;
                ``None`` is treated as an empty mapping.
            tools_model: Model name passed to ``make_llm``.
            temp: Sampling temperature passed to ``make_llm``.

        Returns:
            ``(agent, tools)`` where ``agent`` comes from
            ``create_react_agent`` and ``tools`` is the combined list.
        """
        tool_llm = make_llm(
            model=tools_model,
            api_key=openai_api_key,
            temp=temp,
        )
        # Normalise here so a ``None`` never reaches ``make_tools``.
        key_map = {} if api_keys is None else api_keys
        all_tools = make_tools(tool_llm, api_keys=key_map) + list(tools or [])
        return create_react_agent(tool_llm, all_tools), all_tools

    def __new__(
        cls,
        openai_api_key,
        api_keys: dict = None,
        tools: list = None,
        model="gpt-4-0613",
        tools_model="gpt-4-0613",
        temp=0.1,
        max_iterations=40,
    ):
        """Create the main LLM, the tool agent and the tool list.

        Args:
            openai_api_key: Key passed to ``make_llm`` for both LLMs.
            api_keys: Optional mapping of third-party API keys. Defaults to
                ``None`` (previously a shared mutable ``{}``) and is replaced
                by a fresh dict per call.
            tools: Optional extra tools; defaults to an empty list.
            model: Model name for the main LLM.
            tools_model: Model name for the LLM driving the tool agent.
            temp: Sampling temperature for both LLMs.
            max_iterations: Accepted for interface compatibility; not used
                by this implementation.

        Returns:
            ``(llm, tool_agent, tools)``.
        """
        llm = make_llm(
            model=model,
            temp=temp,
            api_key=openai_api_key,
        )
        tool_agent, tools = cls.create_default_tool(
            openai_api_key=openai_api_key,
            tools=[] if tools is None else tools,
            tools_model=tools_model,
            temp=temp,
            api_keys={} if api_keys is None else api_keys,
        )
        return llm, tool_agent, tools
is the molecular weight of tylenol?'): + _,_,tools=ChemCrewAgent( + model="gpt-4-0613", + temp=0.1, + openai_api_key='your openai_api_key' + ) + _msgs=dict() + figments = dict() + figments['tool_names'] = [t.name for t in tools] + figments['input'] = query + _msgs['messages'] = chat_prompt.invoke(figments).to_messages() + return _msgs + + +if __name__ == '__main__': + _llm,_tool_agent,_tools=ChemCrewAgent( + model="gpt-4-0613", + temp=0.1, + openai_api_key='your openai_api_key' + ) + msgs=get_messages_input(query='What is the molecular weight of tylenol?') + print(_tool_agent.invoke(msgs)['messages'][-1]) + diff --git a/utils/tools/__init__.py b/utils/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/tools/__pycache__/__init__.cpython-310.pyc b/utils/tools/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..c9bb6fd Binary files /dev/null and b/utils/tools/__pycache__/__init__.cpython-310.pyc differ diff --git a/utils/tools/__pycache__/chemspace.cpython-310.pyc b/utils/tools/__pycache__/chemspace.cpython-310.pyc new file mode 100644 index 0000000..589dba7 Binary files /dev/null and b/utils/tools/__pycache__/chemspace.cpython-310.pyc differ diff --git a/utils/tools/__pycache__/converters.cpython-310.pyc b/utils/tools/__pycache__/converters.cpython-310.pyc new file mode 100644 index 0000000..8af7613 Binary files /dev/null and b/utils/tools/__pycache__/converters.cpython-310.pyc differ diff --git a/utils/tools/__pycache__/make_llm.cpython-310.pyc b/utils/tools/__pycache__/make_llm.cpython-310.pyc new file mode 100644 index 0000000..f886c20 Binary files /dev/null and b/utils/tools/__pycache__/make_llm.cpython-310.pyc differ diff --git a/utils/tools/__pycache__/make_tools.cpython-310.pyc b/utils/tools/__pycache__/make_tools.cpython-310.pyc new file mode 100644 index 0000000..5b4fdbb Binary files /dev/null and b/utils/tools/__pycache__/make_tools.cpython-310.pyc differ diff --git 
class ChemSpace:
    """Small client for the chem-space.com API (token auth, lookup, pricing)."""

    def __init__(self, chemspace_api_key=None):
        """Store the API key and immediately fetch an access token."""
        self.chemspace_api_key = chemspace_api_key
        self._renew_token()  # Create token

    def _renew_token(self):
        """Request a fresh access token and store it on the instance."""
        self.chemspace_token = requests.get(
            url="https://api.chem-space.com/auth/token",
            headers={
                "Accept": "application/json",
                "Authorization": f"Bearer {self.chemspace_api_key}",
            },
        ).json()["access_token"]

    def _make_api_request(
        self,
        query,
        request_type,
        count,
        categories,
    ):
        """
        Make a generic request to chem-space API.

        Categories request.
            CSCS: Custom Request: Could be useful for requesting whole synthesis
            CSMB: Make-On-Demand Building Blocks
            CSSB: In-Stock Building Blocks
            CSSS: In-stock Screening Compounds
            CSMS: Make-On-Demand Screening Compounds

        Returns the decoded JSON response. When the API reports invalid
        credentials the token is renewed and the request is sent once more.
        """

        def _do_request():
            return requests.request(
                "POST",
                url=f"https://api.chem-space.com/v3/search/{request_type}?count={count}&page=1&categories={categories}",
                headers={
                    "Accept": "application/json; version=3.1",
                    "Authorization": f"Bearer {self.chemspace_token}",
                },
                data={"SMILES": f"{query}"},
            ).json()

        data = _do_request()

        # Renew the token and retry only when the API says it is invalid.
        if (
            isinstance(data, dict)
            and data.get("message")
            == "Your request was made with invalid credentials."
        ):
            self._renew_token()
            data = _do_request()
        return data

    def _convert_single(self, query, search_type: str):
        """Do query for a single molecule"""
        data = self._make_api_request(query, "exact", 1, "CSCS,CSMB,CSSB")
        if data["count"] > 0:
            return data["items"][0][search_type]
        return "No data was found for this compound."

    def convert_mol_rep(self, query, search_type: str = "smiles"):
        """Convert one or several molecules (separated by ', ') to *search_type*.

        Each molecule yields ``"<molecule>'s <search_type> is: <value>"``;
        several results are joined with ``"; "``. Any failure produces a
        usage hint instead of raising.
        """
        try:
            # Label each entry with its own item (``q``), not the whole query.
            return "; ".join(
                f"{q}'s {search_type} is: {self._convert_single(q, search_type)}"
                for q in query.split(", ")
            )
        except Exception:
            return "The input provided is wrong. Input either a single molecule, or multiple molecules separated by a ', '"

    def _is_purchasable(self, s):
        """Return True when molbloom lists the molecule as purchasable (ZINC20)."""
        if not is_smiles(s):
            try:
                # Use the raw lookup value; ``convert_mol_rep`` returns a
                # sentence, which molbloom cannot parse.
                s = self._convert_single(s, "smiles")
            except Exception:
                return False
        try:
            return bool(molbloom.buy(s, canonicalize=True))
        except Exception:
            print("invalid smiles")
            return False

    def buy_mol(
        self,
        smiles,
        request_type="exact",
        count=1,
    ):
        """
        Get data about purchasing compounds.

        smiles: smiles string of the molecule you want to buy
        request_type: one of "exact", "sim" (search by similarity), "sub" (search by substructure).
        count: retrieve data for this many substances max.

        Returns a human-readable sentence describing the cheapest offer, or a
        short message when nothing can be priced.
        Raises ValueError for an unknown request_type.
        """
        if request_type == "exact":
            categories = "CSMB,CSSB"
        elif request_type in ("sim", "sub"):
            categories = "CSSS,CSMS"
        else:
            raise ValueError(
                f"Unknown request_type {request_type!r}; expected 'exact', 'sim' or 'sub'."
            )

        purchasable = self._is_purchasable(smiles)
        data = self._make_api_request(smiles, request_type, count, categories)

        try:
            if data["count"] == 0:
                if purchasable:
                    return "Compound is purchasable, but price is unknown."
                return "Compound is not purchasable."
        except KeyError:
            return "Invalid query, try something else. "

        print(f"Obtaining data for {data['count']} substances.")

        # One frame per item, built from one frame per vendor offer.
        frames = []
        for item in data["items"]:
            offer_frames = []
            for offer in item["offers"]:
                prices = pd.DataFrame(offer["prices"])
                prices["vendorName"] = offer["vendorName"]
                prices["time"] = offer["shipsWithin"]
                prices["purity"] = offer["purity"]
                offer_frames.append(prices)
            if not offer_frames:
                continue  # pd.concat raises on an empty list
            item_frame = pd.concat(offer_frames)
            item_frame["smiles"] = item["smiles"]
            frames.append(item_frame)

        if not frames:
            return "Compound is purchasable, but price is unknown."

        df = pd.concat(frames).reset_index(drop=True)
        df["quantity"] = df["pack"].astype(str) + df["uom"]
        df["time"] = df["time"].astype(str) + " days"
        df = df.drop(columns=["pack", "uom"])

        # Keep only rows whose price parses as a number (decimals included).
        numeric_prices = pd.to_numeric(df["priceUsd"], errors="coerce").dropna()
        if numeric_prices.empty:
            return "Compound is purchasable, but price is unknown."

        # idxmin returns an index label, so select with .loc rather than .iloc.
        cheapest = df.loc[numeric_prices.idxmin()]
        return f"{cheapest['quantity']} of this molecule cost {cheapest['priceUsd']} USD and can be purchased at {cheapest['vendorName']}."
# LangChain tool: looks up the cheapest ChemSpace offer for a molecule.
# The docstring and the Annotated descriptions are what the LLM sees as the
# tool description, so their wording is left untouched.
@tool
def GetMoleculePrice(
    query:Annotated[str,'the molecule you want to query for its price'],
    chemspace_api_key:Annotated[str,'your chemspace_api_key (may given in the system prompt)']=None,
):
    """
    "GetMoleculePrice"
    Description:
    Get the cheapest available price of a molecule.

    Remark: Leave the chemspace_api_key=None if not existed
    """
    # Bail out when the key is MISSING. The original test was inverted and
    # rejected every call that did supply a key.
    if not chemspace_api_key:
        return "No Chemspace API key found. This tool may not be used without a Chemspace API key."

    try:
        return ChemSpace(chemspace_api_key).buy_mol(query)
    except Exception as e:
        # Tools report failures to the agent as text instead of raising.
        return str(e)
# LangChain tool: resolves a molecule name or SMILES to its CAS number and
# warns when the molecule is flagged by ControlChemCheck.
# The docstring is the tool description shown to the LLM and is unchanged.
@tool
def Query2CAS(query:Annotated[str,'Input molecule (name or SMILES)']):
    """
    Mol2CAS
    Input molecule (name or SMILES), returns CAS number.
    """
    url_cid = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/{}/cids/JSON"
    )
    url_data = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/JSON"
    )
    # The logic used to sit in a nested ``_run(self, query)`` that was never
    # called (and referenced ``self``), so the tool always returned None.
    try:
        # A SMILES query doubles as the structure for the safety check.
        smiles = query if is_smiles(query) else None
        try:
            cas = query2cas(query, url_cid, url_data)
        except ValueError as e:
            return str(e)
        if smiles is None:
            try:
                # NOTE(review): second argument kept as None as in the
                # original call; confirm against pubchem_query2smiles.
                smiles = pubchem_query2smiles(cas, None)
            except ValueError as e:
                return str(e)
        # check if mol is controlled
        msg = ControlChemCheck(smiles)
        if "high similarity" in msg or "appears" in msg:
            return f"CAS number {cas} found, but " + msg
        return cas
    except ValueError:
        return "CAS number not found"
def make_tools(llm, api_keys: dict = None, verbose=True):
    """Assemble the list of tools handed to the agent.

    Args:
        llm: Accepted for interface compatibility; not used here.
        api_keys: Optional mapping of API keys. ``SERP_API_KEY`` and
            ``CHEMSPACE_API_KEY`` are read from it, falling back to the
            environment variables of the same name. ``None`` (the default)
            means an empty mapping.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        A new list of tools. ``GetMoleculePrice`` is included only when a
        ChemSpace key is available, ``WebSearch`` only when a SERP key is.
    """
    api_keys = api_keys or {}
    serp_api_key = api_keys.get("SERP_API_KEY") or os.getenv("SERP_API_KEY")
    chemspace_api_key = api_keys.get("CHEMSPACE_API_KEY") or os.getenv(
        "CHEMSPACE_API_KEY"
    )

    all_tools = [
        Name2SMILES,
        Query2CAS,
        SMILES2Name,
        PatentCheck,
        MolSimilarity,
        SMILES2Weight,
        FuncGroups,
        ExplosiveCheck,
        ControlChemCheck,
        SimilarControlChemCheck,
        # SafetySummary(),#to activate this,change the llm used in safety.py line 272
        LiteratureSearch,
    ]
    if chemspace_api_key:
        # Register the tool object itself. Calling it with the key (as the
        # original did) ran the tool with the key as its query.
        all_tools.append(GetMoleculePrice)
    if serp_api_key:
        # NOTE(review): WebSearch is defined outside this view. If it is also
        # an @tool function it should be appended uncalled -- confirm.
        all_tools += [WebSearch(serp_api_key)]
    # The RXN4Chem tools (RXNPredict / RXNRetrosynthesis) are not wired in.

    return all_tools
What do these GHS classifications mean when dealing with this substance?" + "Environmental risks: What are the environmental impacts of handling this substance." + "Societal impact: What are the societal concerns of this substance? For instance, is it a known chemical weapon, is it illegal, or is it a controlled substance for any reason?" + "For each point, use maximum two sentences. Use only the information provided in the paragraph below." + "If there is not enough information in a category, you may fill in with your knowledge, but explicitly state so." + "Here is the information:{data}" +) + +summary_each_data = ( + "Please summarize the following, highlighting important information for health, laboratory and environemntal safety." + "Do not exceed {approx_length} characters. The data is: {data}" +) diff --git a/utils/tools/rk.py b/utils/tools/rk.py new file mode 100644 index 0000000..bca5d03 --- /dev/null +++ b/utils/tools/rk.py @@ -0,0 +1,124 @@ +from typing import Annotated +from langchain_core.tools import tool +from .utils import tanimoto +from rdkit import Chem +from rdkit.Chem import rdMolDescriptors + +@tool +def MolSimilarity( + smiles_pair:Annotated[str,"Input two molecule SMILES (separated by '.')"], +): + ''' + MolSimilarity + Input two molecule SMILES (separated by '.'), returns Tanimoto similarity. 
# LangChain tool: lists the functional groups found in a molecule by matching
# a fixed table of SMARTS patterns.
# The docstring is the tool description shown to the LLM and is unchanged.
@tool
def FuncGroups(smiles:Annotated[str,"Input SMILES"]):
    """
    FunctionalGroups
    Input a molecule SMILES or name.
    Returns a list of functional groups identified by their common name (in natural language).
    """
    # Common name -> SMARTS pattern.
    dict_fgs = {
        "furan": "o1cccc1",
        "aldehydes": " [CX3H1](=O)[#6]",
        "esters": " [#6][CX3](=O)[OX2H0][#6]",
        "ketones": " [#6][CX3](=O)[#6]",
        "amides": " C(=O)-N",
        "thiol groups": " [SH]",
        "alcohol groups": " [OH]",
        "methylamide": "*-[N;D2]-[C;D3](=O)-[C;D1;H3]",
        "carboxylic acids": "*-C(=O)[O;D1]",
        "carbonyl methylester": "*-C(=O)[O;D2]-[C;D1;H3]",
        "terminal aldehyde": "*-C(=O)-[C;D1]",
        "amide": "*-C(=O)-[N;D1]",
        "carbonyl methyl": "*-C(=O)-[C;D1;H3]",
        "isocyanate": "*-[N;D2]=[C;D2]=[O;D1]",
        "isothiocyanate": "*-[N;D2]=[C;D2]=[S;D1]",
        "nitro": "*-[N;D3](=[O;D1])[O;D1]",
        "nitroso": "*-[N;R0]=[O;D1]",
        "oximes": "*=[N;R0]-[O;D1]",
        "Imines": "*-[N;R0]=[C;D1;H2]",
        "terminal azo": "*-[N;D2]=[N;D2]-[C;D1;H3]",
        "hydrazines": "*-[N;D2]=[N;D1]",
        "diazo": "*-[N;D2]#[N;D1]",
        "cyano": "*-[C;D2]#[N;D1]",
        "primary sulfonamide": "*-[S;D4](=[O;D1])(=[O;D1])-[N;D1]",
        "methyl sulfonamide": "*-[N;D2]-[S;D4](=[O;D1])(=[O;D1])-[C;D1;H3]",
        "sulfonic acid": "*-[S;D4](=O)(=O)-[O;D1]",
        "methyl ester sulfonyl": "*-[S;D4](=O)(=O)-[O;D2]-[C;D1;H3]",
        "methyl sulfonyl": "*-[S;D4](=O)(=O)-[C;D1;H3]",
        "sulfonyl chloride": "*-[S;D4](=O)(=O)-[Cl]",
        "methyl sulfinyl": "*-[S;D3](=O)-[C;D1]",
        "methyl thio": "*-[S;D2]-[C;D1;H3]",
        "thiols": "*-[S;D1]",
        "thio carbonyls": "*=[S;D1]",
        "halogens": "*-[#9,#17,#35,#53]",
        "t-butyl": "*-[C;D4]([C;D1])([C;D1])-[C;D1]",
        "tri fluoromethyl": "*-[C;D4](F)(F)F",
        "acetylenes": "*-[C;D2]#[C;D1;H]",
        "cyclopropyl": "*-[C;D3]1-[C;D2]-[C;D2]1",
        "ethoxy": "*-[O;D2]-[C;D2]-[C;D1;H3]",
        "methoxy": "*-[O;D2]-[C;D1;H3]",
        "side-chain hydroxyls": "*-[O;D1]",
        "primary amines": "*-[N;D1]",
        "nitriles": "*#[N;D1]",
    }

    try:
        # Parse the molecule once instead of once per pattern.
        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is None:
            return "Wrong argument. Please input a valid molecular SMILES."

        # The original helper took a stray ``self`` parameter, so every call
        # raised TypeError and the tool always reported a wrong argument.
        fgs_in_molec = [
            name
            for name, smarts in dict_fgs.items()
            if mol.HasSubstructMatch(Chem.MolFromSmarts(smarts))
        ]
    except Exception:
        return "Wrong argument. Please input a valid molecular SMILES."

    if not fgs_in_molec:
        return "This molecule contains none of the known functional groups."
    if len(fgs_in_molec) > 1:
        return f"This molecule contains {', '.join(fgs_in_molec[:-1])}, and {fgs_in_molec[-1]}."
    return f"This molecule contains {fgs_in_molec[0]}."
+ return self.pubchem_data[cas_number] + + def ghs_classification(self, text): + """Gives the ghs classification from Pubchem. Give this tool the name or CAS number of one molecule.""" + if is_smiles(text): + return "Please input a valid CAS number." + data = self._fetch_pubchem_data(text) + if isinstance(data, str): + return "Molecule not found in Pubchem." + try: + for section in data["Record"]["Section"]: + if section.get("TOCHeading") == "Chemical Safety": + ghs = [ + markup["Extra"] + for markup in section["Information"][0]["Value"][ + "StringWithMarkup" + ][0]["Markup"] + ] + if ghs: + return ghs + except (StopIteration, KeyError): + return None + + @staticmethod + def _scrape_pubchem(data, heading1, heading2, heading3): + try: + filtered_sections = [] + for section in data["Record"]["Section"]: + toc_heading = section.get("TOCHeading") + if toc_heading == heading1: + for section2 in section["Section"]: + if section2.get("TOCHeading") == heading2: + for section3 in section2["Section"]: + if section3.get("TOCHeading") == heading3: + filtered_sections.append(section3) + return filtered_sections + except: + return None + + def _get_safety_data(self, cas): + data = self._fetch_pubchem_data(cas) + safety_data = [] + + iterations = [ + ( + [ + "Health Hazards", + "GHS Classification", + "Hazards Summary", + "NFPA Hazard Classification", + ], + "Safety and Hazards", + "Hazards Identification", + ), + ( + ["Explosive Limits and Potential", "Preventive Measures"], + "Safety and Hazards", + "Safety and Hazard Properties", + ), + ( + [ + "Inhalation Risk", + "Effects of Long Term Exposure", + "Personal Protective Equipment (PPE)", + ], + "Safety and Hazards", + "Exposure Control and Personal Protection", + ), + ( + ["Toxicity Summary", "Carcinogen Classification"], + "Toxicity", + "Toxicological Information", + ), + ] + + for items, header1, header2 in iterations: + safety_data.extend( + [self._scrape_pubchem(data, header1, header2, item)] for item in items + ) + + 
return safety_data + + @staticmethod + def _num_tokens(string, encoding_name="text-davinci-003"): + """Returns the number of tokens in a text string.""" + encoding = tiktoken.encoding_for_model(encoding_name) + num_tokens = len(encoding.encode(string)) + return num_tokens + + def get_safety_summary(self, cas): + safety_data = self._get_safety_data(cas) + approx_length = int( + (3500 * 4) / len(safety_data) - 0.1 * ((3500 * 4) / len(safety_data)) + ) + prompt_short = PromptTemplate( + template=summary_each_data, input_variables=["data", "approx_length"] + ) + llm_chain_short = LLMChain(prompt=prompt_short, llm=self.llm) + + llm_output = [] + for info in safety_data: + if self._num_tokens(str(info)) > approx_length: + trunc_info = str(info)[:approx_length] + llm_output.append( + llm_chain_short.run( + {"data": str(trunc_info), "approx_length": approx_length} + ) + ) + else: + llm_output.append( + llm_chain_short.run( + {"data": str(info), "approx_length": approx_length} + ) + ) + return llm_output + + +@tool +def ControlChemCheck(query:Annotated[str,'Input CAS number']): + ''' + ControlChemCheck + Input CAS number, True if molecule is a controlled chemical. + + ''' + data_path = pkg_resources.resource_filename("chemcrow", "data/chem_wep_smi.csv") + cw_df = pd.read_csv(data_path) + try: + if is_smiles(query): + query_esc = re.escape(query) + found = ( + cw_df["smiles"] + .astype(str) + .str.contains(f"^{query_esc}$", regex=True) + .any() + ) + else: + found = ( + cw_df["cas"] + .astype(str) + .str.contains(rf"^\({re.escape(query)}\)$", regex=True) # raw string for the literal parens; query is user text, so escape it like the SMILES branch does + .any() + ) + if found: + return ( + f"The molecule {query} appears in a list of " + "controlled chemicals."
+ ) + else: + # Get smiles of CAS number + try: + smi = pubchem_query2smiles(query) + except ValueError as e: + return str(e) + # Check similarity to known controlled chemicals + return SimilarControlChemCheck.invoke(smi) # it is a @tool object: use invoke, calling it directly is deprecated + + except Exception as e: + return f"Error: {e}" + +@tool +def SimilarControlChemCheck(smiles:Annotated[str,'Input SMILES string.']): + """ + SimilarControlChemCheck + Input SMILES, returns similarity to controlled chemicals + Checks max similarity between compound and controlled chemicals + + """ + + def _tanimoto(s1, s2): + sim = tanimoto(s1, s2) + if isinstance(sim, float): + return sim + return 0.0 + + data_path = pkg_resources.resource_filename("chemcrow", "data/chem_wep_smi.csv") + cw_df = pd.read_csv(data_path) + try: + if not is_smiles(smiles): + return "Please input a valid SMILES string." + + max_sim = cw_df["smiles"].apply(lambda x: _tanimoto(smiles, x)).max() + if max_sim > 0.35: + return ( + f"{smiles} has a high similarity " + f"({max_sim:.4}) to a known controlled chemical." + ) + else: + return ( + f"{smiles} has a low similarity " + f"({max_sim:.4}) to a known controlled chemical. " + "This substance is safe, you may proceed with the task." + ) + except Exception: # keep the generic tool error, but let KeyboardInterrupt/SystemExit through + return "Tool error." + +@tool +def ExplosiveCheck(cas_number:Annotated[str,'Input CAS number'],llm=sample_llm): + """ + ExplosiveCheck + Input CAS number, returns if molecule is explosive. + Checks if a molecule has an explosive GHS classification using pubchem. + """ + mol_safety = MoleculeSafety(llm=llm) + if is_smiles(cas_number): + return "Please input a valid CAS number." + cls = mol_safety.ghs_classification(cas_number) + if cls is None: + return ( + "Explosive Check Error. The molecule may not be assigned a GHS rating. 
" + ) + if isinstance(cls, str): # ghs_classification returned a lookup message, not GHS data: report it instead of a false "not explosive" + return cls + hazard_text = str(cls) + return "Molecule is explosive" if ("Explos" in hazard_text or "explos" in hazard_text) else "Molecule is not known to be explosive" + +@tool +def SafetySummary(cas: Annotated[str,'Input CAS number']): + """ + SafetySummary + Summarizes the safety data PubChem holds for a molecule. + Input CAS number, returns a summary of safety information. + The summary includes Operator safety, GHS information,Environmental risks, and Societal impact. + + """ + + llm=sample_llm # + + mol_safety = MoleculeSafety(llm=llm) + prompt = PromptTemplate( + template=safety_summary_prompt, input_variables=["data"] + ) + llm_chain=LLMChain(prompt=prompt, llm=llm) + if is_smiles(cas): + return "Please input a valid CAS number." + data = mol_safety._fetch_pubchem_data(cas) + if isinstance(data, str): + return "Molecule not found in Pubchem." + + data = mol_safety.get_safety_summary(cas) + return llm_chain.run(" ".join(data)) \ No newline at end of file diff --git a/utils/tools/search.py b/utils/tools/search.py new file mode 100644 index 0000000..488d851 --- /dev/null +++ b/utils/tools/search.py @@ -0,0 +1,151 @@ +import os +import re +from typing import Annotated + +from langchain.chains.llm import LLMChain +from langchain_core.prompts import PromptTemplate +from langchain_core.tools import tool +from langchain_openai import ChatOpenAI +from pypdf.errors import PdfReadError +from langchain_community.utilities import SerpAPIWrapper +from .utils import is_multiple_smiles, split_smiles +from langchain_openai.embeddings import OpenAIEmbeddings +import molbloom +# import paperqa +import paperscraper + + +def paper_scraper(search: str, + pdir: str = "query", + semantic_scholar_api_key: str = None) -> dict: + try: + return paperscraper.search_papers( + search, + pdir=pdir, + semantic_scholar_api_key=semantic_scholar_api_key, + ) + except KeyError: + return {} + +def scholar2result_llm(llm, + query, + k=5, + max_sources=2, + openai_api_key=None, +
semantic_scholar_api_key=None): + """Useful to answer questions that require + technical knowledge. Ask a specific question.""" + papers = paper_search(llm, query, semantic_scholar_api_key=semantic_scholar_api_key) + if len(papers) == 0: + return "Not enough papers found" + import paperqa # function-scope import: the module-level "import paperqa" is commented out, so the name was undefined here + docs = paperqa.Docs( + llm=llm, + summary_llm=llm, + embeddings=OpenAIEmbeddings(openai_api_key=openai_api_key), + ) + not_loaded = 0 + for path, data in papers.items(): + try: + docs.add(path, data["citation"]) + except (ValueError, FileNotFoundError, PdfReadError): + not_loaded += 1 + + if not_loaded > 0: + print(f"\nFound {len(papers.items())} papers but couldn't load {not_loaded}.") + else: + print(f"\nFound {len(papers.items())} papers and loaded all of them.") + + answer = docs.query(query, k=k, max_sources=max_sources).formatted_answer + return answer + +def paper_search(llm, query, semantic_scholar_api_key=None): + prompt = PromptTemplate( + input_variables=["question"], + template=""" + I would like to find scholarly papers to answer + this question: {question}. Your response must be at + most 10 words long. + 'A search query that would bring up papers that can answer + this question would be: '""", + ) + + query_chain = LLMChain(llm=llm, prompt=prompt) + os.makedirs("query", exist_ok=True) # todo: move to ckpt; exist_ok avoids the isdir/mkdir race + search = query_chain.run(query) + print("\nSearch:", search) + papers = paper_scraper(search, pdir=f"query/{re.sub(' ', '', search)}", semantic_scholar_api_key=semantic_scholar_api_key) + return papers + + + + +def web_search(keywords, search_engine="google"): + try: + return SerpAPIWrapper( + serpapi_api_key=os.getenv("SERP_API_KEY"), search_engine=search_engine + ).run(keywords) + except Exception: # best-effort search; do not trap KeyboardInterrupt/SystemExit + return "No results, try another search" + +@tool +def PatentCheck(smiles:Annotated[str,'Input SMILES']): + ''' + Checks if compound is patented.
Give this tool only one SMILES string + ''' + if is_multiple_smiles(smiles): + smiles_list = split_smiles(smiles) + else: + smiles_list = [smiles] + try: + output_dict = {} + for smi in smiles_list: + r = molbloom.buy(smi, canonicalize=True, catalog="surechembl") + if r: + output_dict[smi] = "Patented" + else: + output_dict[smi] = "Novel" + return str(output_dict) + except Exception: # any lookup failure is reported as invalid input; KeyboardInterrupt/SystemExit still propagate + return "Invalid SMILES string" + +@tool +def LiteratureSearch(query:Annotated[str,'Ask a specific question'], + + openai_api_key:Annotated[str,'openai_api_key, find it in prompt. If there is no key provided, leave this as None'], + semantic_scholar_api_key:Annotated[str,'semantic_scholar_api_key. If there is no key provided, leave this as None'] + ): + """ + LiteratureSearch + Useful to answer questions that require technical + knowledge. Ask a specific question. + """ + llm=ChatOpenAI( + api_key=openai_api_key, + model='gpt-4-0613', # was 'gpt4-0613', which is not a valid OpenAI model id + temperature=0.2, + ) + return scholar2result_llm( + llm, + query, + openai_api_key=openai_api_key, + semantic_scholar_api_key=semantic_scholar_api_key + ) + +@tool +def WebSearch( + query:Annotated[str,'Input a specific question,Do not mention any specific molecule names, but use more general features to formulate your questions.'], + serp_api_key:str + ): + ''' + WebSearch + Input a specific question, returns an answer from web search. + Do not mention any specific molecule names, but use more general features to formulate your questions. + ''' + if not serp_api_key: + return ( + "No SerpAPI key found. This tool may not be used without a SerpAPI key."
+ ) + return web_search(query) + + diff --git a/utils/tools/utils.py b/utils/tools/utils.py new file mode 100644 index 0000000..da2af07 --- /dev/null +++ b/utils/tools/utils.py @@ -0,0 +1,146 @@ +import re + +import requests +from rdkit import Chem, DataStructs +from rdkit.Chem import AllChem + + +def is_smiles(text): + try: + m = Chem.MolFromSmiles(text, sanitize=False) + if m is None: + return False + return True + except Exception: # non-string input etc. means "not SMILES"; KeyboardInterrupt/SystemExit still propagate + return False + + +def is_multiple_smiles(text): + if is_smiles(text): + return "." in text + return False + + +def split_smiles(text): + return text.split(".") + + +def is_cas(text): + pattern = r"^\d{2,7}-\d{2}-\d$" + return re.match(pattern, text) is not None + + +def largest_mol(smiles): + ss = smiles.split(".") + ss.sort(key=len) + while not is_smiles(ss[-1]): + rm = ss[-1] + ss.remove(rm) + return ss[-1] + + +def canonical_smiles(smiles): + try: + smi = Chem.MolToSmiles(Chem.MolFromSmiles(smiles), canonical=True) + return smi + except Exception: + return "Invalid SMILES string" + + +def tanimoto(s1, s2): + """Calculate the Tanimoto similarity of two SMILES strings.""" + try: + mol1 = Chem.MolFromSmiles(s1) + mol2 = Chem.MolFromSmiles(s2) + fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=2048) + fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=2048) + return DataStructs.TanimotoSimilarity(fp1, fp2) + except (TypeError, ValueError, AttributeError): + return "Error: Not a valid SMILES string" + + +def pubchem_query2smiles( + query: str, + url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}", +) -> str: + if is_smiles(query): + if not is_multiple_smiles(query): + return query + else: + raise ValueError( + "Multiple SMILES strings detected, input one molecule at a time."
+ ) + if url is None: + url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}" + r = requests.get(url.format(query, "property/IsomericSMILES/JSON")) + # convert the response to a json object + data = r.json() + # return the SMILES string + try: + smi = data["PropertyTable"]["Properties"][0]["IsomericSMILES"] + except KeyError: + return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time." + return str(Chem.CanonSmiles(largest_mol(smi))) + + +def query2cas(query: str, url_cid: str, url_data: str): + try: + mode = "name" + if is_smiles(query): + if is_multiple_smiles(query): + raise ValueError( + "Multiple SMILES strings detected, input one molecule at a time." + ) + mode = "smiles" + url_cid = url_cid.format(mode, query) + cid = requests.get(url_cid).json()["IdentifierList"]["CID"][0] + url_data = url_data.format(cid) + data = requests.get(url_data).json() + except (requests.exceptions.RequestException, KeyError): + raise ValueError("Invalid molecule input, no Pubchem entry") + + try: + for section in data["Record"]["Section"]: + if section.get("TOCHeading") == "Names and Identifiers": + for subsection in section["Section"]: + if subsection.get("TOCHeading") == "Other Identifiers": + for subsubsection in subsection["Section"]: + if subsubsection.get("TOCHeading") == "CAS": + return subsubsection["Information"][0]["Value"][ + "StringWithMarkup" + ][0]["String"] + except KeyError: + raise ValueError("Invalid molecule input, no Pubchem entry") + + raise ValueError("CAS number not found") + + +def smiles2name(smi, single_name=True): + """This function queries the given molecule smiles and returns a name record or iupac""" + + try: + smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True) + except Exception: + raise ValueError("Invalid SMILES string") + # query the PubChem database + r = requests.get( + "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/" + + smi + + 
"/synonyms/JSON" + ) + # convert the response to a json object + data = r.json() + # return the SMILES string + try: + if single_name: + index = 0 + names = data["InformationList"]["Information"][0]["Synonym"] + while is_cas(name := names[index]): + index += 1 + if index == len(names): + raise ValueError("No name found") + else: + name = data["InformationList"]["Information"][0]["Synonym"] + except KeyError: + raise ValueError("Unknown Molecule") + return name diff --git a/webpage.py b/webpage.py new file mode 100644 index 0000000..e75cc38 --- /dev/null +++ b/webpage.py @@ -0,0 +1,80 @@ +from streamlit import chat_message + +from utils.agents import ChemCrewAgent,get_messages_input +from langgraph.graph.graph import START,END + +from utils.tools.make_llm import make_llm +from langgraph.graph.message import MessagesState + +# from langgraph.prebuilt import create_react_agent +from langchain_core.output_parsers import StrOutputParser +from langgraph.graph.state import StateGraph + + +from operator import add +from typing import TypedDict, List, Tuple, Annotated + +import streamlit as st + +if 'chat_history' not in st.session_state: + st.session_state.chat_history = [('ai','how can I assist you today')] + +class SampleState(TypedDict): + messages: Annotated[List[Tuple[str, str]],add] + +class MyGraph: + _llm,_tool_agent,_tools=ChemCrewAgent( + model="gpt-4-0613", + temp=0.1, + openai_api_key='openai_api_key' + ) + @staticmethod + def show_chat_history(state: SampleState) -> SampleState: + for role, content in state['messages']: + st.chat_message(role).write(content) + return {'messages': list()} + + @classmethod + def call_tool_agent(cls,state: SampleState) -> SampleState: + chain = cls._llm | StrOutputParser() + role,_query=state['messages'][-1] + msgs=get_messages_input(query=_query) + tool_rt = cls._tool_agent.invoke(msgs)['messages'][-1] + prompts = [ + ('system', 'rephrase the content from tool'), + ('human', f'tool:[{tool_rt}]') + ] + + rt = 
st.chat_message('ai').write_stream(chain.stream(prompts)) + return {'messages': [('ai', rt)]} + + def __new__(cls): + _build= StateGraph(SampleState) + _build.add_node('show_chat_history', cls.show_chat_history) + _build.add_node('call_tool_agent',cls.call_tool_agent) + + _build.add_edge(START,'show_chat_history') + _build.add_edge('show_chat_history','call_tool_agent') + _build.add_edge('call_tool_agent',END) + + graph=_build.compile() + return graph + +if __name__ == '__main__': + + st.sidebar.write( + st.session_state.chat_history + ) + + Graph = MyGraph() + state=SampleState() + human=st.chat_input('What is the molecular weight of tylenol?') + if human: + st.session_state.chat_history.append(('human',human)) + state['messages']=st.session_state.chat_history + state=Graph.invoke(state) + st.session_state.chat_history=state['messages'] + + + +