From 41f729e9588720e1c6cc7696b0f9881941cb526e Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Tue, 17 Dec 2024 14:26:41 -0600 Subject: [PATCH] simplify AST building --- src/gbigsmiles/__init__.py | 100 ++++++++++++++-------- src/gbigsmiles/atom.py | 6 +- src/gbigsmiles/bond.py | 25 ++++++ src/gbigsmiles/data/g-bigsmiles.lark | 33 ++++---- src/gbigsmiles/exception.py | 46 +++------- src/gbigsmiles/molecule.py | 5 +- src/gbigsmiles/parser.py | 16 +++- src/gbigsmiles/system.py | 7 +- src/gbigsmiles/transformer.py | 120 +++------------------------ src/gbigsmiles/util.py | 12 +++ tests/test_core.py | 20 ----- 11 files changed, 171 insertions(+), 219 deletions(-) delete mode 100644 tests/test_core.py diff --git a/src/gbigsmiles/__init__.py b/src/gbigsmiles/__init__.py index d5b5d1e..f46f7bd 100644 --- a/src/gbigsmiles/__init__.py +++ b/src/gbigsmiles/__init__.py @@ -15,49 +15,79 @@ "Please make sure to install this module correctly via setuptools with setuptools_scm activated to generate a `_version.py` file." ) from exc -from .atom import Atom -from .bond import BondDescriptor -from .core import _GLOBAL_RNG, BigSMILESbase, reaction_graph_to_dot_string -from .distribution import Distribution, FlorySchulz, Gauss -from .exception import ( - GBigSMILESError, - GBigSMILESInitNotEnoughError, - GBigSMILESInitTooMuchError, - GBigSMILESParsingError, +from .atom import ( + AliphaticOrganic, + AromaticOrganic, + AromaticSymbol, + Atom, + AtomCharge, + AtomClass, + AtomSymbol, + BracketAtom, + Chiral, + HCount, + Isotope, ) -from .graph_generate import AtomGraph -from .mixture import Mixture -from .mol_prob import get_ensemble_prob -from .molecule import Molecule -from .stochastic import Stochastic -from .system import System -from .token import SmilesToken -from .transformer import GBigSMILESTransformer -from .util import camel_to_snake, snake_to_camel +from .bond import ( + BondDescriptor, + BondDescriptorGeneration, + BondDescriptorSymbol, + BondDescriptorSymbolIdx, + BondSymbol, + InnerBondDescriptor, + RingBond, + SimpleBondDescriptor, + TerminalBondDescriptor, +) +from .core import BigSMILESbase +from .distribution import ( + FlorySchulz, + Gauss, + LogNormal, + Poisson, + StochasticDistribution, + Uniform, +) +from .parser import get_global_parser +from .transformer import GBigSMILESTransformer, get_global_transformer +from .util import camel_to_snake, get_global_rng, snake_to_camel + +# from .graph_generate import AtomGraph +# from .mixture import Mixture +# from .mol_prob import get_ensemble_prob +# from .molecule import Molecule +# from .stochastic import Stochastic +# from .system import System +# from .token import SmilesToken __all__ = [ "__version__", "version_tuple", + "Atom", + "BracketAtom", + "Isotope", + "AtomSymbol", + "Chiral", + "HCount", + "AtomCharge", + "AtomClass", + "AromaticSymbol", + "AliphaticOrganic", + "AromaticOrganic", + "BondSymbol", + "RingBond", + "BondDescriptorSymbol", + "BondDescriptorSymbolIdx", + "BondDescriptorGeneration", + "InnerBondDescriptor", "BondDescriptor", - "_GLOBAL_RNG", + "SimpleBondDescriptor", + "TerminalBondDescriptor", "BigSMILESbase", - "reaction_graph_to_dot_string", - "Distribution", - "FlorySchulz", - "Gauss", - "Atom", - "AtomGraph", - "Mixture", - "get_ensemble_prob", - "Molecule", - "Stochastic", - "System", - "SmilesToken", - "GBigSMILESError", - "GBigSMILESParsingError", - "GBigSMILESInitNotEnoughError", - "GBigSMILESInitTooMuchError", "camel_to_snake", "snake_to_camel", + "get_global_rng", "GBigSMILESTransformer", + "get_global_transformer", + "get_global_parser", ] diff --git a/src/gbigsmiles/atom.py b/src/gbigsmiles/atom.py index 35b3d83..f03ce80 100644 --- a/src/gbigsmiles/atom.py +++ b/src/gbigsmiles/atom.py @@ -5,7 +5,7 @@ import lark from .core import BigSMILESbase -from .exception import GBigSMILESParsingError, GBigSMILESTooManyTokens +from .exception import ParsingError, TooManyTokens class Atom(BigSMILESbase): @@ -33,7 +33,7 @@ def __init__(self, children: list): for child in self._children: if isinstance(child, AtomSymbol): if self._symbol is not None: - raise GBigSMILESTooManyTokens(self.__class__, self._symbol, child) + raise TooManyTokens(self.__class__, self._symbol, child) self._symbol = child def generate_string(self, extension): @@ -294,7 +294,7 @@ def __init__(self, children: list): super().__init__(children) if str(self._children[0]) != "H": - raise GBigSMILESParsingError(self._children[0]) + raise ParsingError(self._children[0]) if len(self._children) > 1: self._count = int(self._children[1]) diff --git a/src/gbigsmiles/bond.py b/src/gbigsmiles/bond.py index d71481a..9e7c3ca 100644 --- a/src/gbigsmiles/bond.py +++ b/src/gbigsmiles/bond.py @@ -2,8 +2,13 @@ # Copyright (c) 2022: Ludwig Schneider # See LICENSE for details +try: + from typing import Self +except ImportError: + from typing_extensions import Self from .core import BigSMILESbase +from .parser import get_global_parser def _create_compatible_bond_text(bond): @@ -309,6 +314,12 @@ def transition(self): class BondDescriptor(BigSMILESbase): + @classmethod + def make(cls, text: str) -> Self: + if "$" in text or "<" in text or ">" in text: + return SimpleBondDescriptor.make(text) + return TerminalBondDescriptor.make(text) + @property def symbol(self): return None @@ -334,6 +345,13 @@ def __init__(self, children): if isinstance(child, InnerBondDescriptor): self._inner_bond_descriptor = child + @classmethod + def make(cls, text: str) -> Self: + # We use BigSMILESbase.make.__func__ to get the underlying function of the class method, + # then call it with cls as the first argument to ensure child typing. + # We do not want to call StochasticDistribution's make function, because it directs here. + return BigSMILESbase.make.__func__(cls, text) + def generate_string(self, extension): return "[" + self._inner_bond_descriptor.generate_string(extension) + "]" @@ -369,6 +387,13 @@ def __init__(self, children): if isinstance(child, BondDescriptorGeneration): self._generation = child + @classmethod + def make(cls, text: str) -> Self: + # We use BigSMILESbase.make.__func__ to get the underlying function of the class method, + # then call it with cls as the first argument to ensure child typing. + # We do not want to call StochasticDistribution's make function, because it directs here. + return BigSMILESbase.make.__func__(cls, text) + @property def weight(self): return self._generation.weight diff --git a/src/gbigsmiles/data/g-bigsmiles.lark b/src/gbigsmiles/data/g-bigsmiles.lark index ad324dd..bdb760e 100644 --- a/src/gbigsmiles/data/g-bigsmiles.lark +++ b/src/gbigsmiles/data/g-bigsmiles.lark @@ -40,8 +40,8 @@ bond_symbol: "-" | "/" | "\\" -ring_bond: bond_symbol? DIGIT - | bond_symbol? "%" DIGIT? DIGIT +ring_bond: bond_symbol? INT + | bond_symbol? "%" INT _branched_atom: _atom_stand_in ring_bond* branch* @@ -77,12 +77,18 @@ bond_descriptor: simple_bond_descriptor | ladder_bond_descriptor | non_covalent_ terminal_bond_descriptor: "[" bond_descriptor_symbol_idx? bond_descriptor_generation? "]" stochastic_generation: "|" stochastic_distribution "|" -stochastic_distribution: "flory_schulz(" WS_INLINE* NUMBER WS_INLINE* ")" - | "schulz_zimm(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" - | "gauss(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" - | "uniform(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" - | "log_normal(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" - | "poisson(" WS_INLINE* NUMBER WS_INLINE* ")" +flory_schulz: "flory_schulz(" WS_INLINE* NUMBER WS_INLINE* ")" +schulz_zimm: "schulz_zimm(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" +gauss: "gauss(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" +uniform: "uniform(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" +log_normal: "log_normal(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")" +poisson: "poisson(" WS_INLINE* NUMBER WS_INLINE* ")" +stochastic_distribution: flory_schulz + | schulz_zimm + | gauss + | uniform + | log_normal + | poisson _unary_index_operator: "!" _binary_index_operator: "~" | "&" @@ -94,10 +100,10 @@ _non_covalent_key_value_pair: WS_INLINE* "," WS_INLINE* _printable_character+ "= _non_covalent_context: WS_INLINE* "|" WS_INLINE* _index_expression _non_covalent_key_value_pair* h_count: "H" - | "H" DIGIT + | "H" INT -atom_charge: "-" DIGIT? - | "+" DIGIT? +atom_charge: "-" INT? + | "+" INT? | "--" | "++" @@ -117,7 +123,6 @@ end_group.-1: ";" WS_INLINE* smiles _monomer_list* %import common.INT %import common.SIGNED_NUMBER %import common.NUMBER -%import common.DIGIT %import common.WS %import common.WS_INLINE @@ -152,8 +157,8 @@ chiral: "@" | "@SP1" | "@SP2" | "@SP3" - | "@TB" DIGIT? DIGIT - | "@OH" DIGIT? DIGIT + | "@TB" INT + | "@OH" INT _element_symbols: "H" | "He" diff --git a/src/gbigsmiles/exception.py b/src/gbigsmiles/exception.py index f523de2..c901532 100644 --- a/src/gbigsmiles/exception.py +++ b/src/gbigsmiles/exception.py @@ -11,7 +11,7 @@ class GBigSMILESError(Exception): pass -class GBigSMILESParsingError(GBigSMILESError): +class ParsingError(GBigSMILESError): """ Parsing the Grammar went in an unanticipated manner. Please report bug with input string. @@ -24,39 +24,7 @@ def __str__(self): return f"Unanticipated error while parsing. Please report and provide the input string. Token: {self.token} start: {self.token.start_pos}" -class GBigSMILESInitNotEnoughError(GBigSMILESError): - """ - GBigSMILES classes usually need to be initialized either via text, - or as part of parsing a different string. - - If this isn't followed, this exception is raise. - Initialize the elements of G-BigSMILES with (part of) a G-BigSMILES string. - """ - - def __init__(self, class_name): - self.class_name = class_name - - def __str__(self): - return f"Attempt to initialize {self.class_name} without sufficient arguments. Initialize objects of {self.class_name} by passing (part of) a G-BigSMILES string." - - -class GBigSMILESInitTooMuchError(GBigSMILESError): - """ - GBigSMILES classes usually need to be initialized either via text, - or as part of parsing a different string, but not both. - - If this isn't followed, this exception is raise. - Initialize the elements of G-BigSMILES with (part of) a G-BigSMILES string. - """ - - def __init__(self, class_name): - self.class_name = class_name - - def __str__(self): - return f"Attempt to initialize {self.class_name} with tree and text arguments. Initialize objects of {self.class_name} by passing (part of) a G-BigSMILES string." - - -class GBigSMILESTooManyTokens(GBigSMILESError): +class TooManyTokens(ParsingError): def __init__(self, class_name, existing_token, new_token): self.class_name = class_name self.existing_token = existing_token @@ -67,3 +35,13 @@ def __str__(self): string += f"The existing token is {self.existing_token} which conflicts with the new " string += f"token {self.new_token}. Most likely in implementation error, please report." return string + + +class UnknownDistribution(GBigSMILESError): + def __init__(self, distribution_text: str): + self.distribution_text = distribution_text + + def __str__(self): + string = f"GBigSMILES a distribution with the following text {self.distribution_text} is unknown." + string += " Typo or not implemented distribution." + return string diff --git a/src/gbigsmiles/molecule.py b/src/gbigsmiles/molecule.py index 23ce75b..8ff5a35 100644 --- a/src/gbigsmiles/molecule.py +++ b/src/gbigsmiles/molecule.py @@ -7,11 +7,12 @@ import networkx as nx from .bond import _create_compatible_bond_text -from .core import _GLOBAL_RNG, BigSMILESbase +from .core import BigSMILESbase from .mixture import Mixture from .stochastic import Stochastic from .stochastic_atom_graph import StochasticAtomGraph from .token import SmilesToken +from .util import get_global_rng class Molecule(BigSMILESbase): @@ -144,7 +145,7 @@ def generate_string(self, extension): string += self.mixture.generate_string(extension) return string - def generate(self, prefix=None, rng=_GLOBAL_RNG): + def generate(self, prefix=None, rng=get_global_rng()): my_mol = prefix for element in self._elements: my_mol = element.generate(my_mol, rng) diff --git a/src/gbigsmiles/parser.py b/src/gbigsmiles/parser.py index d3424ca..331526d 100644 --- a/src/gbigsmiles/parser.py +++ b/src/gbigsmiles/parser.py @@ -30,6 +30,7 @@ def _make_parser(filename=None, start_tokens=None): "ladder_bond_descriptor", "non_covalent_bond_descriptor", "bond_descriptor", + "simple_bond_descriptor", "terminal_bond_descriptor", "stochastic_generation", "stochastic_distribution", @@ -41,9 +42,22 @@ def _make_parser(filename=None, start_tokens=None): "atom_symbol", "aromatic_symbol", "bracket_atom", + "flory_schulz", + "uniform", + "schulz_zimm", + "log_normal", + "gauss", ] parser = Lark(rf"{grammar_text}", start=start_tokens, keep_all_tokens=True) return parser -_GLOBAL_PARSER = _make_parser() +_GLOBAL_PARSER: None | Lark = None + + +def get_global_parser(): + global _GLOBAL_PARSER + if _GLOBAL_PARSER is None: + _GLOBAL_PARSER = _make_parser() + + return _GLOBAL_PARSER diff --git a/src/gbigsmiles/system.py b/src/gbigsmiles/system.py index ada2247..1211e2d 100644 --- a/src/gbigsmiles/system.py +++ b/src/gbigsmiles/system.py @@ -7,9 +7,10 @@ import numpy as np -from .core import _GLOBAL_RNG, BigSMILESbase +from .core import BigSMILESbase from .mixture import Mixture from .molecule import Molecule +from .util import get_global_rng def _estimate_system_molecular_weight(molecules, system_molweight): @@ -154,7 +155,7 @@ def generate_string(self, extension): return string @property - def generator(self, rng=_GLOBAL_RNG): + def generator(self, rng=get_global_rng()): if not self.generable: raise RuntimeError("Generable system required") @@ -171,7 +172,7 @@ def generator(self, rng=_GLOBAL_RNG): raise RuntimeError("We expect a fully generated molecule here.") yield mol_gen - def generate(self, prefix=None, rng=_GLOBAL_RNG): + def generate(self, prefix=None, rng=get_global_rng()): relative_fractions = [mol.mixture.relative_mass for mol in self._molecules] mol_idx = rng.choice( diff --git a/src/gbigsmiles/transformer.py b/src/gbigsmiles/transformer.py index 316bd12..8291868 100644 --- a/src/gbigsmiles/transformer.py +++ b/src/gbigsmiles/transformer.py @@ -2,111 +2,10 @@ # Copyright (c) 2022: Ludwig Schneider # See LICENSE for details import lark +from lark.visitors import Discard class GBigSMILESTransformer(lark.Transformer): - def atom(self, children): - from .atom import Atom - - a = Atom(children) - return a - - def bracket_atom(self, children): - from .atom import BracketAtom - - return BracketAtom(children) - - def chiral(self, children): - from .atom import Chiral - - return Chiral(children) - - def h_count(self, children): - from .atom import HCount - - return HCount(children) - - def atom_charge(self, children): - from .atom import AtomCharge - - return AtomCharge(children) - - def atom_class(self, children): - from .atom import AtomClass - - return AtomClass(children) - - def isotope(self, children): - from .atom import Isotope - - return Isotope(children) - - def atom_symbol(self, children): - from .atom import AtomSymbol - - return AtomSymbol(children) - - def aromatic_symbol(self, children): - from .atom import AromaticSymbol - - return AromaticSymbol(children) - - def aliphatic_organic(self, children): - from .atom import AliphaticOrganic - - return AliphaticOrganic(children) - - def aromatic_organic(self, children): - from .atom import AromaticOrganic - - return AromaticOrganic(children) - - def bond_symbol(self, children): - from .bond import BondSymbol - - return BondSymbol(children) - - def ring_bond(self, children): - from .bond import RingBond - - return RingBond(children) - - def bond_descriptor_symbol(self, children): - from .bond import BondDescriptorSymbol - - return BondDescriptorSymbol(children) - - def bond_descriptor_symbol_idx(self, children): - from .bond import BondDescriptorSymbolIdx - - return BondDescriptorSymbolIdx(children) - - def terminal_bond_descriptor(self, children): - from .bond import TerminalBondDescriptor - - return TerminalBondDescriptor(children) - - def simple_bond_descriptor(self, children): - from .bond import SimpleBondDescriptor - - return SimpleBondDescriptor(children) - - def inner_bond_descriptor(self, children): - from .bond import InnerBondDescriptor - - return InnerBondDescriptor(children) - - def bond_descriptor_generation(self, children): - from .bond import BondDescriptorGeneration - - return BondDescriptorGeneration(children) - - def bond_descriptor(self, children): - from .bond import BondDescriptor - - assert isinstance(children[0], BondDescriptor) - return children[0] - def NUMBER(self, children): return float(children) @@ -114,13 +13,20 @@ def INT(self, children): return int(children) def WS_INLINE(self, children): - from lark.visitors import Discard - return Discard - # def big_smiles_fragment_definition(self, children +_GLOBAL_TRANSFORMER: None | GBigSMILESTransformer = None + + +def get_global_transformer(): + global _GLOBAL_TRANSFORMER + if _GLOBAL_TRANSFORMER is None: + import gbigsmiles -# "{[][<]N=Cc(cc1)ccc1C=N[13C@OH1H2+1:3]CC[Si](C)(C)O{[<][>][Si](C)(C)O[<][>]}[Si](C)(C)CCC[>][]}" + transformer = lark.ast_utils.create_transformer( + ast_module=gbigsmiles, transformer=GBigSMILESTransformer(visit_tokens=True) + ) + _GLOBAL_TRANSFORMER = transformer -_GLOBAL_TRANSFORMER = GBigSMILESTransformer(visit_tokens=True) + return _GLOBAL_TRANSFORMER diff --git a/src/gbigsmiles/util.py b/src/gbigsmiles/util.py index 0f3985c..b423b44 100644 --- a/src/gbigsmiles/util.py +++ b/src/gbigsmiles/util.py @@ -4,6 +4,10 @@ import re +import numpy as np + +_GLOBAL_RNG: None | np.random.Generator = None + def snake_to_camel(snake_str): """ @@ -73,3 +77,11 @@ def camel_to_snake(name): # Finally, handle the case where an acronym is at the start of the string s3 = re.sub("([A-Z])([A-Z][a-z])", r"\1_\2", s2) return s3.lower() + + +def get_global_rng(seed=None): + global _GLOBAL_RNG + if _GLOBAL_RNG is None: + _GLOBAL_RNG = np.random.default_rng(seed) + + return _GLOBAL_RNG diff --git a/tests/test_core.py b/tests/test_core.py deleted file mode 100644 index 974c927..0000000 --- a/tests/test_core.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from gbigsmiles import Atom, GBigSMILESInitNotEnoughError, GBigSMILESInitTooMuchError - - -def test_core_errors(): - Atom("C") - with pytest.raises(GBigSMILESInitTooMuchError): - try: - Atom("C", ["C"]) - except Exception as exc: - print(exc) - raise exc - - with pytest.raises(GBigSMILESInitNotEnoughError): - try: - Atom() - except Exception as exc: - print(exc) - raise exc